/*******************************************************************************
 * Copyright 2020 Intel Corporation.
 *
 *
 * This software and the related documents are Intel copyrighted materials, and your use of them is governed by
 * the express license under which they were provided to you ('License'). Unless the License provides otherwise,
 * you may not use, modify, copy, publish, distribute, disclose or transmit this software or the related
 * documents without Intel's prior written permission.
 * This software and the related documents are provided as is, with no express or implied warranties, other than
 * those that are expressly stated in the License.
 *******************************************************************************/

/*
//
//    Purpose: Computes normalized cross-correlation between an image and a template
//    Contents:
//       ippiCrossCorrNorm_32f_C1R_T
//       ippiCrossCorrNorm_16u32f_C1R_T
//       ippiCrossCorrNorm_8u32f_C1R_T
//       ippiCrossCorrNorm_8u_C1RSfs_T
//
*/
#include "picrosscorrnorm_t.h"

IppStatus owniCrossCorrNormGetSize(OwniCrossCorrConfig *pCfg)
{
    IppStatus status = ippStsNoErr;
    IppiROIShape shape = (IppiROIShape)(pCfg->algType & ippiROIMask);
    IppiNormOp normOp = (IppiNormOp)(pCfg->algType & ippiNormMask);
    IppiSize dstRoiSize, fftS, tplRoiSize, srcRoiSize, frameRoiSize;

    int orderX = 1, orderY = 1;
    int fftSize;

    srcRoiSize = pCfg->srcRoiSize;
    tplRoiSize = pCfg->tplRoiSize;

    switch (shape) {
    case ippiROIFull:
        dstRoiSize.width = pCfg->srcRoiSize.width + tplRoiSize.width - 1;
        dstRoiSize.height = pCfg->srcRoiSize.height + tplRoiSize.height - 1;
        if (dstRoiSize.width * dstRoiSize.height > _CORRFULL_BLOCK_ALG * tplRoiSize.width * tplRoiSize.height) {
            while (1 << orderX < (2 * tplRoiSize.width))
                ++orderX;
            fftS.width = 1 << orderX;
            if ((orderX < _DOUBLE_SIZE) && (fftS.width < dstRoiSize.width))
                fftS.width = 1 << (++orderX);
            while (1 << orderY < (2 * tplRoiSize.height))
                ++orderY;
            fftS.height = 1 << orderY;
            if ((orderY < _DOUBLE_SIZE) && (fftS.height < dstRoiSize.height))
                fftS.height = 1 << (++orderY);
        } else {
            while (1 << orderX < dstRoiSize.width)
                ++orderX;
            while (1 << orderY < dstRoiSize.height)
                ++orderY;
            fftS.width = 1 << orderX;
            fftS.height = 1 << orderY;
        }
        break;
    case ippiROISame:
        dstRoiSize.width = pCfg->srcRoiSize.width;
        dstRoiSize.height = pCfg->srcRoiSize.height;
        if (srcRoiSize.width * srcRoiSize.height > _CORRSAME_BLOCK_ALG * tplRoiSize.width * tplRoiSize.height) {
            while (1 << orderX < (2 * tplRoiSize.width))
                ++orderX;
            fftS.width = 1 << orderX;
            if ((orderX < _DOUBLE_SIZE) && (fftS.width < dstRoiSize.width))
                fftS.width = 1 << (++orderX);
            while (1 << orderY < (2 * tplRoiSize.height))
                ++orderY;
            fftS.height = 1 << orderY;
            if ((orderY < _DOUBLE_SIZE) && (fftS.height < dstRoiSize.height))
                fftS.height = 1 << (++orderY);
        } else {
            while (1 << orderX < srcRoiSize.width)
                ++orderX;
            while (1 << orderY < srcRoiSize.height)
                ++orderY;
            fftS.width = 1 << (++orderX);
            fftS.height = 1 << (++orderY);
        }
        break;
    case ippiROIValid:
        dstRoiSize.width = pCfg->srcRoiSize.width - tplRoiSize.width + 1;
        dstRoiSize.height = pCfg->srcRoiSize.height - tplRoiSize.height + 1;
        if (srcRoiSize.width * srcRoiSize.height > _CORRVALID_N_BLOCK_ALG * tplRoiSize.width * tplRoiSize.height) {
            while (1 << orderX < (2 * tplRoiSize.width))
                ++orderX;
            fftS.width = 1 << orderX;
            if ((orderX < _DOUBLE_SIZE) && (fftS.width < dstRoiSize.width))
                fftS.width = 1 << (++orderX);
            while (1 << orderY < (2 * tplRoiSize.height))
                ++orderY;
            fftS.height = 1 << orderY;
            if ((orderY < _DOUBLE_SIZE) && (fftS.height < dstRoiSize.height))
                fftS.height = 1 << (++orderY);
        } else {
            while (1 << orderX < srcRoiSize.width)
                ++orderX;
            while (1 << orderY < srcRoiSize.height)
                ++orderY;
            fftS.width = 1 << orderX;
            fftS.height = 1 << orderY;
        }
        break;
    default:
        return ippStsAlgTypeErr;
    }

    status = ippiFFTGetSize_R_32f(orderX, orderY, IPP_FFT_DIV_INV_BY_N, ippAlgHintNone, &pCfg->sizeFftSpec, &pCfg->sizeFftSpecBuf, &pCfg->sizeFftBuf);
    if (status != ippStsNoErr)
        return status;
    pCfg->orderX = orderX;
    pCfg->orderY = orderY;

    fftSize = IPP_ALIGNED_SIZE((fftS.width * sizeof(Ipp32f) * fftS.height), IPP_MALLOC_ALIGNED_BYTES);
    pCfg->sizeWrkBuf = 2 * fftSize;

    frameRoiSize.height = fftS.height - tplRoiSize.height + 1;
    frameRoiSize.width = fftS.width - tplRoiSize.width + 1;
    if ((shape == ippiROIFull) && ((fftS.width >= dstRoiSize.width) && (fftS.height >= dstRoiSize.height))) {
        frameRoiSize.width = dstRoiSize.width;
        frameRoiSize.height = dstRoiSize.height;
    }

    switch (normOp) {
    case ippiNormNone:
        pCfg->sizeA = 0;
        break;
    case ippiNorm:
        pCfg->sizeFftBuf = IPP_MAX((pCfg->sizeFftBuf), (int)(frameRoiSize.width * sizeof(Ipp64f)));
        pCfg->sizeA = IPP_ALIGNED_SIZE((frameRoiSize.width * frameRoiSize.height) * sizeof(Ipp32f), IPP_MALLOC_ALIGNED_BYTES);
        break;
    case ippiNormCoefficient:
        pCfg->sizeFftBuf = IPP_MAX((pCfg->sizeFftBuf), (int)(frameRoiSize.width * sizeof(Ipp64f) * 2));
        pCfg->sizeA = IPP_ALIGNED_SIZE((frameRoiSize.width * frameRoiSize.height) * sizeof(Ipp32f), IPP_MALLOC_ALIGNED_BYTES);
        break;
    default:
        return ippStsAlgTypeErr;
    }

    pCfg->fftRoiSize = fftS;
    pCfg->frameRoiSize = frameRoiSize;
    pCfg->dstRoiSize = dstRoiSize;

    return ippStsNoErr;
}

/* //////////////////// Normalization functions ///////////////////// */

/* Square of x. The argument appears twice in the expansion, so it is evaluated
 * twice: pass only expressions without side effects. */
#define SQR(x) ((x) * (x))

/*
 * Computes the normalization denominator image.
 *
 * For every destination position the sum and the sum of squares of the
 * tpl.width x tpl.height source window anchored at that position are
 * maintained incrementally, and
 *     sumSqr - sum * sum / (tpl.width * tpl.height)
 * is stored to pDst.  The finished image is then thresholded against
 * (*thresh) * (*sqrNorm) (smaller values are replaced by 0), multiplied by
 * *sqrNorm and replaced by its square root.
 *
 *   pSrc, sStep - source data and its row stride in elements
 *   tpl         - window (template) size
 *   pDst, dStep - destination data and its row stride in elements
 *   dstSize     - number of window positions per row / number of rows
 *   buf, bStep  - scratch: window sums per column at buf[0..], sums of squares
 *                 at buf[bStep..]; dstSize.width entries of each are used
 *   thresh, sqrNorm - scalars read through the pointers as described above
 */
void owniDenominator_C1R(const Ipp32f *pSrc, int sStep, IppiSize tpl, Ipp32f *pDst, int dStep, IppiSize dstSize, Ipp64f *buf, int bStep,
                         Ipp32f *thresh, Ipp32f *sqrNorm)
{
    const int tplW = tpl.width;
    const int tplH = tpl.height;
    const int dstW = dstSize.width;
    const int dstH = dstSize.height;
    const int dstStepBytes = (int)(dStep * sizeof(Ipp32f));
    const Ipp64f invArea = 1.0 / (tplW * tplH);
    Ipp64f *colSum = buf;
    Ipp64f *colSqr = buf + bStep;
    Ipp64f sum = 0;
    Ipp64f sqr = 0;
    int x, y, row;

    /* Window at the origin: plain accumulation over the template footprint. */
    for (y = 0; y < tplH; y++) {
        const int base = y * sStep;

        for (x = 0; x < tplW; x++) {
            const Ipp64f v = (Ipp64f)pSrc[x + base];

            sum += (v);
            sqr += SQR(v);
        }
    }
    colSum[0] = sum;
    colSqr[0] = sqr;

    /* Rest of the first row: slide right, dropping one column and adding one. */
    for (x = 1; x < dstW; x++) {
        for (y = 0; y < tplH; y++) {
            const int leave = x - 1 + y * sStep;
            const int enter = leave + tplW;
            const Ipp64f vIn = (Ipp64f)pSrc[enter];
            const Ipp64f vOut = (Ipp64f)pSrc[leave];

            sum += (vIn - vOut);
            sqr += (SQR(vIn) - SQR(vOut));
        }
        colSum[x] = sum;
        colSqr[x] = sqr;
    }
    for (x = 0; x < dstW; x++) {
        pDst[x] = (Ipp32f)(colSqr[x] - SQR(colSum[x]) * invArea);
    }

    /*
     * Following rows: sum/sqr hold the change of the window when it moves one
     * row down (entering bottom row minus leaving top row); the change is slid
     * along the row and added to the per-column totals kept in buf.
     */
    for (row = 1; row < dstH; row++) {
        const int dstRow = row * dStep;
        const int top = (row - 1) * sStep;
        const int bottom = (row - 1 + tplH) * sStep;

        sum = 0;
        sqr = 0;
        for (x = 0; x < tplW; x++) {
            const Ipp64f vIn = (Ipp64f)pSrc[x + bottom];
            const Ipp64f vOut = (Ipp64f)pSrc[x + top];

            sum += (vIn - vOut);
            sqr += (SQR(vIn) - SQR(vOut));
        }
        colSum[0] += sum;
        colSqr[0] += sqr;
        pDst[dstRow] = (Ipp32f)(colSqr[0] - SQR(colSum[0]) * invArea);

        for (x = 1; x < dstW; x++) {
            const Ipp64f inRight = (Ipp64f)pSrc[bottom + x + tplW - 1];
            const Ipp64f inLeft = (Ipp64f)pSrc[x + bottom - 1];
            const Ipp64f outRight = (Ipp64f)pSrc[top + x + tplW - 1];
            const Ipp64f outLeft = (Ipp64f)pSrc[x + top - 1];

            sum += (inRight - inLeft - outRight + outLeft);
            sqr += (SQR(inRight) - SQR(inLeft) - SQR(outRight) + SQR(outLeft));
            colSum[x] += sum;
            colSqr[x] += sqr;
            pDst[x + dstRow] = (Ipp32f)(colSqr[x] - SQR(colSum[x]) * invArea);
        }
    }

    /* Post-processing of the whole destination image, in place. */
    ippiThreshold_Val_32f_C1R(pDst, dstStepBytes, pDst, dstStepBytes, dstSize, (*thresh) * (*sqrNorm), 0, ippCmpLess);
    ippiMulC_32f_C1R(pDst, dstStepBytes, *sqrNorm, pDst, dstStepBytes, dstSize);
    ippiSqrt_32f_C1R(pDst, dstStepBytes, pDst, dstStepBytes, dstSize);
}

/*
 * Variant of owniDenominator_C1R in which only the first
 * (dstSize.width - tpl.width + 1) columns and (dstSize.height - tpl.height + 1)
 * rows use a complete window update.  For the remaining columns/rows only the
 * leaving samples are subtracted and no entering sample is read
 * (NOTE(review): presumably because the source is treated as zero beyond that
 * range - confirm against the caller).
 *
 * Per destination position the value
 *     sumSqr - sum * sum / (tpl.width * tpl.height)
 * is stored; the image is then thresholded against (*thresh) * (*sqrNorm)
 * (smaller values replaced by 0), multiplied by *sqrNorm and square-rooted.
 *
 * sStep/dStep are row strides in elements.  buf keeps per-column window sums
 * at buf[0..] and sums of squares at buf[bStep..].
 */
void owniDenominatorFull_C1R(const Ipp32f *pSrc, int sStep, IppiSize tpl, Ipp32f *pDst, int dStep, IppiSize dstSize, Ipp64f *buf, int bStep,
                             Ipp32f *thresh, Ipp32f *sqrNorm)
{
    int i, j, n, f, l; /* f-irst, l-ast */
    Ipp64f *bufSum = buf;
    Ipp64f *bufSqr = buf + bStep;
    /* Number of columns / rows for which a sample still enters the window */
    int xwidth = dstSize.width - tpl.width + 1;
    int xheight = dstSize.height - tpl.height + 1;
    Ipp64f Sum, Sqr;
    Ipp64f inverseSize = 1.0 / (tpl.width * tpl.height);

    /* The first line - special processing */
    Sum = Sqr = 0;
    for (j = 0; j < tpl.height; j++) {
        f = j * sStep;
        for (i = 0; i < tpl.width; i++) {
            Sum += ((Ipp64f)pSrc[i + f]);    /* The first point sum */
            Sqr += SQR((Ipp64f)pSrc[i + f]); /* The first point sqr sum */
        }
    }
    bufSum[0] = Sum;
    bufSqr[0] = Sqr;
    /* Then for the whole line - algorithm of the "running sum" */
    for (n = 1; n < xwidth; n++) {
        for (j = 0; j < tpl.height; j++) {
            f = n - 1 + j * sStep; /* index of the current first element */
            l = f + tpl.width;     /* index of the current last element */
            Sum += ((Ipp64f)pSrc[l] - (Ipp64f)pSrc[f]);
            Sqr += (SQR((Ipp64f)pSrc[l]) - SQR((Ipp64f)pSrc[f]));
        }
        bufSum[n] = Sum;
        bufSqr[n] = Sqr;
    }
    /* Tail of the first line: only the leaving column is subtracted */
    for (n = xwidth; n < dstSize.width; n++) {
        for (j = 0; j < tpl.height; j++) {
            f = n - 1 + j * sStep; /* index of the current first element */
            Sum -= ((Ipp64f)pSrc[f]);
            Sqr -= SQR((Ipp64f)pSrc[f]);
        }
        bufSum[n] = Sum;
        bufSqr[n] = Sqr;
    }
    for (n = 0; n < dstSize.width; n++) {
        pDst[n] = (Ipp32f)(bufSqr[n] - SQR((Ipp64f)bufSum[n]) * inverseSize);
    }

    /* All other lines - algorithm of the "running sum" based on the first line result */
    /* Here Sum/Sqr hold the change of a window when it moves one row down; the change is slid along the row */
    for (n = 1; n < xheight; n++) {
        j = n * dStep;                    /* offset of the destination row */
        f = (n - 1) * sStep;              /* source row leaving the window */
        l = (n - 1 + tpl.height) * sStep; /* source row entering the window */
        Sum = Sqr = 0;
        for (i = 0; i < tpl.width; i++) {
            Sum += ((Ipp64f)pSrc[i + l] - (Ipp64f)pSrc[i + f]);           /* The first portion sum update*/
            Sqr += (SQR((Ipp64f)pSrc[i + l]) - SQR((Ipp64f)pSrc[i + f])); /* The first portion sqr sum update*/
        }
        bufSum[0] += Sum;
        bufSqr[0] += Sqr;
        pDst[j] = (Ipp32f)(bufSqr[0] - SQR(bufSum[0]) * inverseSize);
        /* Columns with a complete update: both rows gain a right sample and lose a left one */
        for (i = 1; i < xwidth; i++) {
            Sum += ((Ipp64f)pSrc[l + i + tpl.width - 1] - (Ipp64f)pSrc[i + l - 1] - (Ipp64f)pSrc[f + i + tpl.width - 1] + (Ipp64f)pSrc[i + f - 1]);
            Sqr += (SQR((Ipp64f)pSrc[l + i + tpl.width - 1]) - SQR((Ipp64f)pSrc[i + l - 1]) - SQR((Ipp64f)pSrc[f + i + tpl.width - 1]) +
                    SQR((Ipp64f)pSrc[i + f - 1]));
            bufSum[i] += Sum;
            bufSqr[i] += Sqr;
            pDst[i + j] = (Ipp32f)(bufSqr[i] - SQR(bufSum[i]) * inverseSize);
        }
        /* Tail columns: only the left samples of both rows are removed from the change */
        for (i = xwidth; i < dstSize.width; i++) {
            Sum += (-(Ipp64f)pSrc[i + l - 1] + (Ipp64f)pSrc[i + f - 1]);
            Sqr += (-SQR((Ipp64f)pSrc[i + l - 1]) + SQR((Ipp64f)pSrc[i + f - 1]));
            bufSum[i] += Sum;
            bufSqr[i] += Sqr;
            pDst[i + j] = (Ipp32f)(bufSqr[i] - SQR(bufSum[i]) * inverseSize);
        }
    }
    /* Tail rows: no row enters the window any more, only the leaving row f is subtracted */
    for (n = xheight; n < dstSize.height; n++) {
        j = n * dStep;
        f = (n - 1) * sStep;
        Sum = Sqr = 0;
        for (i = 0; i < tpl.width; i++) {
            Sum -= ((Ipp64f)pSrc[i + f]);    /* The first portion sum update*/
            Sqr -= SQR((Ipp64f)pSrc[i + f]); /* The first portion sqr sum update*/
        }
        bufSum[0] += Sum;
        bufSqr[0] += Sqr;
        pDst[j] = (Ipp32f)(bufSqr[0] - SQR(bufSum[0]) * inverseSize);
        for (i = 1; i < xwidth; i++) {
            Sum += (-(Ipp64f)pSrc[f + i + tpl.width - 1] + (Ipp64f)pSrc[i + f - 1]);
            Sqr += (-SQR((Ipp64f)pSrc[f + i + tpl.width - 1]) + SQR((Ipp64f)pSrc[i + f - 1]));
            bufSum[i] += Sum;
            bufSqr[i] += Sqr;
            pDst[i + j] = (Ipp32f)(bufSqr[i] - SQR(bufSum[i]) * inverseSize);
        }
        /* Tail rows and tail columns: the change only shrinks by the left sample of the leaving row */
        for (i = xwidth; i < dstSize.width; i++) {
            Sum += ((Ipp64f)pSrc[i + f - 1]);
            Sqr += SQR((Ipp64f)pSrc[i + f - 1]);
            bufSum[i] += Sum;
            bufSqr[i] += Sqr;
            pDst[i + j] = (Ipp32f)(bufSqr[i] - SQR(bufSum[i]) * inverseSize);
        }
    }
    /* In-place post-processing; dStep is converted from elements to bytes for the ippi calls */
    ippiThreshold_Val_32f_C1R(pDst, dStep * sizeof(Ipp32f), pDst, dStep * sizeof(Ipp32f), dstSize, (*thresh) * (*sqrNorm), 0, ippCmpLess);
    ippiMulC_32f_C1R(pDst, dStep * sizeof(Ipp32f), *sqrNorm, pDst, dStep * sizeof(Ipp32f), dstSize);
    ippiSqrt_32f_C1R(pDst, dStep * sizeof(Ipp32f), pDst, dStep * sizeof(Ipp32f), dstSize);
}

/*
 * Computes, for every destination position, the sum of squares of the
 * tpl.width x tpl.height source window anchored at that position (maintained
 * incrementally), then post-processes the image in place: values below *thresh
 * are replaced by 0, the square root is taken and the result is multiplied by
 * *mpy.
 *
 *   pSrc, sStep - source data and its row stride in elements
 *   tpl         - window (template) size
 *   pDst, dStep - destination data and its row stride in elements
 *   dstSize     - number of window positions per row / number of rows
 *   bufSqr      - scratch line with the per-column window sums of squares;
 *                 dstSize.width entries are used
 */
void owniAutoCorr_C1R(const Ipp32f *pSrc, int sStep, IppiSize tpl, Ipp32f *pDst, int dStep, IppiSize dstSize, Ipp64f *bufSqr, Ipp32f *thresh,
                      Ipp32f *mpy)
{
    const int tplW = tpl.width;
    const int tplH = tpl.height;
    const int dstW = dstSize.width;
    const int dstH = dstSize.height;
    const int dstStepBytes = (int)(dStep * sizeof(Ipp32f));
    Ipp64f sqr = 0;
    int x, y, row;

    /* Window at the origin: plain accumulation over the template footprint. */
    for (y = 0; y < tplH; y++) {
        const int base = y * sStep;

        for (x = 0; x < tplW; x++) {
            const Ipp64f v = (Ipp64f)pSrc[x + base];

            sqr += SQR(v);
        }
    }
    bufSqr[0] = sqr;
    pDst[0] = (Ipp32f)sqr;

    /* Rest of the first row: slide right, dropping one column and adding one. */
    for (x = 1; x < dstW; x++) {
        for (y = 0; y < tplH; y++) {
            const int leave = x - 1 + y * sStep;
            const int enter = leave + tplW;
            const Ipp64f vIn = (Ipp64f)pSrc[enter];
            const Ipp64f vOut = (Ipp64f)pSrc[leave];

            sqr += (SQR(vIn) - SQR(vOut));
        }
        bufSqr[x] = sqr;
        pDst[x] = (Ipp32f)sqr;
    }

    /*
     * Following rows: sqr holds the change of the window when it moves one row
     * down (entering bottom row minus leaving top row); the change is slid
     * along the row and added to the per-column totals kept in bufSqr.
     */
    for (row = 1; row < dstH; row++) {
        const int dstRow = row * dStep;
        const int top = (row - 1) * sStep;
        const int bottom = (row - 1 + tplH) * sStep;

        sqr = 0;
        for (x = 0; x < tplW; x++) {
            const Ipp64f vIn = (Ipp64f)pSrc[x + bottom];
            const Ipp64f vOut = (Ipp64f)pSrc[x + top];

            sqr += (SQR(vIn) - SQR(vOut));
        }
        bufSqr[0] += sqr;
        pDst[dstRow] = (Ipp32f)bufSqr[0];

        for (x = 1; x < dstW; x++) {
            const Ipp64f inRight = (Ipp64f)pSrc[bottom + x + tplW - 1];
            const Ipp64f inLeft = (Ipp64f)pSrc[x + bottom - 1];
            const Ipp64f outRight = (Ipp64f)pSrc[top + x + tplW - 1];
            const Ipp64f outLeft = (Ipp64f)pSrc[x + top - 1];

            sqr += (SQR(inRight) - SQR(inLeft) - SQR(outRight) + SQR(outLeft));
            bufSqr[x] += sqr;
            pDst[x + dstRow] = (Ipp32f)bufSqr[x];
        }
    }

    /* Post-processing of the whole destination image, in place. */
    ippiThreshold_Val_32f_C1R(pDst, dstStepBytes, pDst, dstStepBytes, dstSize, (*thresh), 0, ippCmpLess);
    ippiSqrt_32f_C1R(pDst, dstStepBytes, pDst, dstStepBytes, dstSize);
    ippiMulC_32f_C1R(pDst, dstStepBytes, *mpy, pDst, dstStepBytes, dstSize);
}

/*
 * Variant of owniAutoCorr_C1R in which only the first
 * (dstSize.width - tpl.width + 1) columns and (dstSize.height - tpl.height + 1)
 * rows use a complete window update.  For the remaining columns/rows only the
 * leaving samples are subtracted and no entering sample is read
 * (NOTE(review): presumably because the source is treated as zero beyond that
 * range - confirm against the caller).
 *
 * Per destination position the window sum of squares is stored; the image is
 * then processed in place: values below *thresh are replaced by 0, the square
 * root is taken and the result is multiplied by *mpy.
 *
 * sStep/dStep are row strides in elements.  bufSqr keeps the per-column window
 * sums of squares.
 */
void owniAutoCorrFull_C1R(const Ipp32f *pSrc, int sStep, IppiSize tpl, Ipp32f *pDst, int dStep, IppiSize dstSize, Ipp64f *bufSqr, Ipp32f *thresh,
                          Ipp32f *mpy)
{
    int i, j, n, f, l; /* f-irst, l-ast */
    /* Number of columns / rows for which a sample still enters the window */
    int xwidth = dstSize.width - tpl.width + 1;
    int xheight = dstSize.height - tpl.height + 1;
    Ipp64f Sqr;

    /* The first line - special processing */
    Sqr = 0;
    for (j = 0; j < tpl.height; j++) {
        f = j * sStep;
        for (i = 0; i < tpl.width; i++) {
            Sqr += SQR((Ipp64f)pSrc[i + f]); /* The first point sqr sum */
        }
    }
    bufSqr[0] = Sqr;
    pDst[0] = (Ipp32f)Sqr;
    /* Then for the whole line - algorithm of the "running sum" */
    for (n = 1; n < xwidth; n++) {
        for (j = 0; j < tpl.height; j++) {
            f = n - 1 + j * sStep; /* index of the current first element */
            l = f + tpl.width;     /* index of the current last element */
            Sqr += (SQR((Ipp64f)pSrc[l]) - SQR((Ipp64f)pSrc[f]));
        }
        bufSqr[n] = Sqr;
        pDst[n] = (Ipp32f)Sqr;
    }
    /* Tail of the first line: only the leaving column is subtracted */
    for (n = xwidth; n < dstSize.width; n++) {
        for (j = 0; j < tpl.height; j++) {
            f = n - 1 + j * sStep; /* index of the current first element */
            Sqr -= SQR((Ipp64f)pSrc[f]);
        }
        bufSqr[n] = Sqr;
        pDst[n] = (Ipp32f)Sqr;
    }

    /* All other lines - algorithm of the "running sum" based on the first line result */
    /* Here Sqr holds the change of a window when it moves one row down; the change is slid along the row */
    for (n = 1; n < xheight; n++) {
        j = n * dStep;                    /* offset of the destination row */
        f = (n - 1) * sStep;              /* source row leaving the window */
        l = (n - 1 + tpl.height) * sStep; /* source row entering the window */
        Sqr = 0;
        for (i = 0; i < tpl.width; i++) {
            Sqr += (SQR((Ipp64f)pSrc[i + l]) - SQR((Ipp64f)pSrc[i + f])); /* The first portion sqr sum update*/
        }
        bufSqr[0] += Sqr;
        pDst[j] = (Ipp32f)bufSqr[0];
        /* Columns with a complete update: both rows gain a right sample and lose a left one */
        for (i = 1; i < xwidth; i++) {
            Sqr += (SQR((Ipp64f)pSrc[l + i + tpl.width - 1]) - SQR((Ipp64f)pSrc[i + l - 1]) - SQR((Ipp64f)pSrc[f + i + tpl.width - 1]) +
                    SQR((Ipp64f)pSrc[i + f - 1]));
            bufSqr[i] += Sqr;
            pDst[i + j] = (Ipp32f)bufSqr[i];
        }
        /* Tail columns: only the left samples of both rows are removed from the change */
        for (i = xwidth; i < dstSize.width; i++) {
            Sqr += (-SQR((Ipp64f)pSrc[i + l - 1]) + SQR((Ipp64f)pSrc[i + f - 1]));
            bufSqr[i] += Sqr;
            pDst[i + j] = (Ipp32f)bufSqr[i];
        }
    }
    /* Tail rows: no row enters the window any more, only the leaving row f is subtracted */
    for (n = xheight; n < dstSize.height; n++) {
        j = n * dStep;
        f = (n - 1) * sStep;
        Sqr = 0;
        for (i = 0; i < tpl.width; i++) {
            Sqr -= SQR((Ipp64f)pSrc[i + f]); /* The first portion sqr sum update*/
        }
        bufSqr[0] += Sqr;
        pDst[j] = (Ipp32f)bufSqr[0];
        for (i = 1; i < xwidth; i++) {
            Sqr += (-SQR((Ipp64f)pSrc[f + i + tpl.width - 1]) + SQR((Ipp64f)pSrc[i + f - 1]));
            bufSqr[i] += Sqr;
            pDst[i + j] = (Ipp32f)bufSqr[i];
        }
        /* Tail rows and tail columns: the change only shrinks by the left sample of the leaving row */
        for (i = xwidth; i < dstSize.width; i++) {
            Sqr += SQR((Ipp64f)pSrc[i + f - 1]);
            bufSqr[i] += Sqr;
            pDst[i + j] = (Ipp32f)bufSqr[i];
        }
    }
    /* In-place post-processing; dStep is converted from elements to bytes for the ippi calls */
    ippiThreshold_Val_32f_C1R(pDst, dStep * sizeof(Ipp32f), pDst, dStep * sizeof(Ipp32f), dstSize, (*thresh), 0, ippCmpLess);
    ippiSqrt_32f_C1R(pDst, dStep * sizeof(Ipp32f), pDst, dStep * sizeof(Ipp32f), dstSize);
    ippiMulC_32f_C1R(pDst, dStep * sizeof(Ipp32f), *mpy, pDst, dStep * sizeof(Ipp32f), dstSize);
}

// Ignore floating-point exceptions that occur because of optimizations.
// Look at https://github.com/llvm/llvm-project/issues/77492 for more details.
#pragma float_control(precise, on)
#pragma STDC FENV_ACCESS ON
/*
 * In-place element-wise division pSrcDst[k] /= pSrc[k] for k in [0, len).
 * Where the divisor compares equal to zero the quotient is replaced by 0 and
 * the function reports ippStsDivByZero; processing continues for the
 * remaining elements.  Returns ippStsNoErr when no zero divisor was met.
 */
IppStatus ownsCCDiv_32f_I(const Ipp32f *pSrc, Ipp32f *pSrcDst, int len)
{
    IppStatus status = ippStsNoErr;
    int k;

    for (k = 0; k < len; ++k) {
        const Ipp32f divisor = pSrc[k];

        if (divisor == 0.0f) {
            /* Zero divisor: flag it and store a defined value instead of dividing. */
            status = ippStsDivByZero;
            pSrcDst[k] = 0;
            continue;
        }
        pSrcDst[k] = pSrcDst[k] / divisor;
    }
    return status;
}
#pragma STDC FENV_ACCESS OFF
#pragma float_control(precise, off)

/*
 * In-place 2D division: every row of pSrcDst is divided element-wise by the
 * matching row of pSrc using ownsCCDiv_32f_I.
 *
 *   pSrc, srcStep       - divisor image and its row step in bytes
 *   pSrcDst, srcDstStep - dividend/result image and its row step in bytes
 *   roiSize             - number of elements per row and number of rows
 *
 * Returns ippStsNoErr, or the first non-ippStsNoErr status reported for any
 * row (all rows are still processed).
 */
IppStatus owniCCDiv_32f_C1IR(const Ipp32f *pSrc, int srcStep, Ipp32f *pSrcDst, int srcDstStep, IppiSize roiSize)
{
    /* Rows are walked through byte pointers; the divisor keeps its const qualifier. */
    const Ipp8u *pDivRow = (const Ipp8u *)pSrc;
    Ipp8u *pQuotRow = (Ipp8u *)pSrcDst;
    IppStatus result = ippStsNoErr;
    int row;

    for (row = 0; row < roiSize.height; row++) {
        IppStatus rowStatus = ownsCCDiv_32f_I((const Ipp32f *)pDivRow, (Ipp32f *)pQuotRow, roiSize.width);

        /* Remember only the first failure. */
        if ((rowStatus != ippStsNoErr) && (result == ippStsNoErr)) {
            result = rowStatus;
        }
        pDivRow += srcStep;
        pQuotRow += srcDstStep;
    }
    return result;
}

/* //////// Assistant functions for zero-padded FFT buffers ///////// */

/*
 * Copies the srcRect-sized 32f rectangle at pRect (row step srcStep, bytes)
 * into the top-left corner of the FFT buffer pFftBuf, whose rows are
 * fftSize.width elements long, and zero-fills the rest of the buffer: first
 * the columns to the right of the copied rows, then all rows below them.
 */
void owniClipRectZeroTail_32f_C1R(const Ipp32f *pRect, int srcStep, IppiSize srcRect, Ipp32f *pFftBuf, IppiSize fftSize)
{
    const int fftW = fftSize.width;
    const int fftH = fftSize.height;
    const int cols = srcRect.width;
    const int rows = srcRect.height;
    const int fftStepBytes = (int)(fftW * sizeof(Ipp32f));
    const int padCols = fftW - cols;

    ippiCopy_32f_C1R(pRect, srcStep, pFftBuf, fftStepBytes, srcRect);

    if (padCols > 0) {
        IppiSize rightPad;

        rightPad.width = padCols;
        rightPad.height = rows;
        ippiSet_32f_C1R(0, pFftBuf + cols, fftStepBytes, rightPad);
    }
    if (rows < fftH) {
        /* The rows below the rectangle are contiguous, so clear them in one call. */
        ippsZero_32f(pFftBuf + fftW * rows, (fftH - rows) * fftW);
    }
}

/*
 * Converts the srcRect-sized 8u rectangle at pRect (row step srcStep, bytes)
 * to 32f into the top-left corner of the FFT buffer pFftBuf, whose rows are
 * fftSize.width elements long, and zero-fills the rest of the buffer: first
 * the columns to the right of the converted rows, then all rows below them.
 */
void owniClipRectZeroTail_8u32f_C1R(const Ipp8u *pRect, int srcStep, IppiSize srcRect, Ipp32f *pFftBuf, IppiSize fftSize)
{
    const int fftW = fftSize.width;
    const int fftH = fftSize.height;
    const int cols = srcRect.width;
    const int rows = srcRect.height;
    const int fftStepBytes = (int)(fftW * sizeof(Ipp32f));
    const int padCols = fftW - cols;

    ippiConvert_8u32f_C1R(pRect, srcStep, pFftBuf, fftStepBytes, srcRect);

    if (padCols > 0) {
        IppiSize rightPad;

        rightPad.width = padCols;
        rightPad.height = rows;
        ippiSet_32f_C1R(0, pFftBuf + cols, fftStepBytes, rightPad);
    }
    if (rows < fftH) {
        /* The rows below the rectangle are contiguous, so clear them in one call. */
        ippsZero_32f(pFftBuf + fftW * rows, (fftH - rows) * fftW);
    }
}

/*
 * Converts the srcRect-sized 16u rectangle at pRect (row step srcStep, bytes)
 * to 32f into the top-left corner of the FFT buffer pFftBuf, whose rows are
 * fftSize.width elements long, and zero-fills the rest of the buffer: first
 * the columns to the right of the converted rows, then all rows below them.
 */
void owniClipRectZeroTail_16u32f_C1R(const Ipp16u *pRect, int srcStep, IppiSize srcRect, Ipp32f *pFftBuf, IppiSize fftSize)
{
    const int fftW = fftSize.width;
    const int fftH = fftSize.height;
    const int cols = srcRect.width;
    const int rows = srcRect.height;
    const int fftStepBytes = (int)(fftW * sizeof(Ipp32f));
    const int padCols = fftW - cols;

    ippiConvert_16u32f_C1R(pRect, srcStep, pFftBuf, fftStepBytes, srcRect);

    if (padCols > 0) {
        IppiSize rightPad;

        rightPad.width = padCols;
        rightPad.height = rows;
        ippiSet_32f_C1R(0, pFftBuf + cols, fftStepBytes, rightPad);
    }
    if (rows < fftH) {
        /* The rows below the rectangle are contiguous, so clear them in one call. */
        ippsZero_32f(pFftBuf + fftW * rows, (fftH - rows) * fftW);
    }
}

/*
 * Places a 32f rectangle into the FFT buffer at the offset
 * (shiftSize.width, shiftSize.height) and zero-fills everything around it.
 *
 * The rectangle is first clipped so that, once shifted, it does not exceed
 * the fftSize.width x fftSize.height buffer.  Fill order: rows above the
 * rectangle, columns to its left (over all remaining rows), the rectangle
 * itself, columns to its right, rows below it.
 *
 * srcStep is the source row step in bytes; buffer rows are fftSize.width
 * elements long.
 */
void owniShiftClipRectZeroTail_32f_C1R(const Ipp32f *pRect, int srcStep, IppiSize srcRect, Ipp32f *pFftBuf, IppiSize fftSize, IppiSize shiftSize)
{
    const int fftW = fftSize.width;
    const int fftH = fftSize.height;
    const int offX = shiftSize.width;
    const int offY = shiftSize.height;
    const int fftStepBytes = (int)(fftW * sizeof(Ipp32f));
    Ipp32f *pBody = pFftBuf + offX + fftW * offY; /* where the rectangle lands */
    IppiSize band;
    int cols, rows;

    srcRect.width = IPP_MIN(srcRect.width, (fftW - offX));
    srcRect.height = IPP_MIN(srcRect.height, (fftH - offY));
    cols = srcRect.width;
    rows = srcRect.height;

    if (offY > 0) {
        ippsZero_32f(pFftBuf, fftW * offY);
    }
    if (offX > 0) {
        band.width = offX;
        band.height = fftH - offY;
        ippiSet_32f_C1R(0, pFftBuf + fftW * offY, fftStepBytes, band);
    }

    ippiCopy_32f_C1R(pRect, srcStep, pBody, fftStepBytes, srcRect);

    band.width = fftW - cols - offX;
    band.height = rows;
    if (band.width > 0) {
        ippiSet_32f_C1R(0, pBody + cols, fftStepBytes, band);
    }
    if (rows < (fftH - offY)) {
        ippsZero_32f(pFftBuf + fftW * (rows + offY), (fftH - rows - offY) * fftW);
    }
}

/*
 * Converts an 8u rectangle to 32f into the FFT buffer at the offset
 * (shiftSize.width, shiftSize.height) and zero-fills everything around it.
 *
 * The rectangle is first clipped so that, once shifted, it does not exceed
 * the fftSize.width x fftSize.height buffer.  Fill order: rows above the
 * rectangle, columns to its left (over all remaining rows), the rectangle
 * itself, columns to its right, rows below it.
 *
 * srcStep is the source row step in bytes; buffer rows are fftSize.width
 * elements long.
 */
void owniShiftClipRectZeroTail_8u32f_C1R(const Ipp8u *pRect, int srcStep, IppiSize srcRect, Ipp32f *pFftBuf, IppiSize fftSize, IppiSize shiftSize)
{
    const int fftW = fftSize.width;
    const int fftH = fftSize.height;
    const int offX = shiftSize.width;
    const int offY = shiftSize.height;
    const int fftStepBytes = (int)(fftW * sizeof(Ipp32f));
    Ipp32f *pBody = pFftBuf + offX + fftW * offY; /* where the rectangle lands */
    IppiSize band;
    int cols, rows;

    srcRect.width = IPP_MIN(srcRect.width, (fftW - offX));
    srcRect.height = IPP_MIN(srcRect.height, (fftH - offY));
    cols = srcRect.width;
    rows = srcRect.height;

    if (offY > 0) {
        ippsZero_32f(pFftBuf, fftW * offY);
    }
    if (offX > 0) {
        band.width = offX;
        band.height = fftH - offY;
        ippiSet_32f_C1R(0, pFftBuf + fftW * offY, fftStepBytes, band);
    }

    ippiConvert_8u32f_C1R(pRect, srcStep, pBody, fftStepBytes, srcRect);

    band.width = fftW - cols - offX;
    band.height = rows;
    if (band.width > 0) {
        ippiSet_32f_C1R(0, pBody + cols, fftStepBytes, band);
    }
    if (rows < (fftH - offY)) {
        ippsZero_32f(pFftBuf + fftW * (rows + offY), (fftH - rows - offY) * fftW);
    }
}

/*
 * Converts a 16u rectangle to 32f into the FFT buffer at the offset
 * (shiftSize.width, shiftSize.height) and zero-fills everything around it.
 *
 * The rectangle is first clipped so that, once shifted, it does not exceed
 * the fftSize.width x fftSize.height buffer.  Fill order: rows above the
 * rectangle, columns to its left (over all remaining rows), the rectangle
 * itself, columns to its right, rows below it.
 *
 * srcStep is the source row step in bytes; buffer rows are fftSize.width
 * elements long.
 */
void owniShiftClipRectZeroTail_16u32f_C1R(const Ipp16u *pRect, int srcStep, IppiSize srcRect, Ipp32f *pFftBuf, IppiSize fftSize, IppiSize shiftSize)
{
    const int fftW = fftSize.width;
    const int fftH = fftSize.height;
    const int offX = shiftSize.width;
    const int offY = shiftSize.height;
    const int fftStepBytes = (int)(fftW * sizeof(Ipp32f));
    Ipp32f *pBody = pFftBuf + offX + fftW * offY; /* where the rectangle lands */
    IppiSize band;
    int cols, rows;

    srcRect.width = IPP_MIN(srcRect.width, (fftW - offX));
    srcRect.height = IPP_MIN(srcRect.height, (fftH - offY));
    cols = srcRect.width;
    rows = srcRect.height;

    if (offY > 0) {
        ippsZero_32f(pFftBuf, fftW * offY);
    }
    if (offX > 0) {
        band.width = offX;
        band.height = fftH - offY;
        ippiSet_32f_C1R(0, pFftBuf + fftW * offY, fftStepBytes, band);
    }

    ippiConvert_16u32f_C1R(pRect, srcStep, pBody, fftStepBytes, srcRect);

    band.width = fftW - cols - offX;
    band.height = rows;
    if (band.width > 0) {
        ippiSet_32f_C1R(0, pBody + cols, fftStepBytes, band);
    }
    if (rows < (fftH - offY)) {
        ippsZero_32f(pFftBuf + fftW * (rows + offY), (fftH - rows - offY) * fftW);
    }
}

/* ////////    Complex conjugate image in RCPack2D format   ///////// */

/*
 * In-place sign flip of selected entries of an image (per the section title:
 * complex conjugation of data stored in RCPack2D format).
 *
 * Negated entries:
 *   - column 0 of rows 2, 4, 6, ... (< height), and for an even width also
 *     the last column of those rows;
 *   - columns 2, 4, 6, ... (< width) of every row.
 *
 * step is the row step in bytes.
 */
void owniRCPack2DConj_32f_C1IR(Ipp32f *pSrcDst, int step, IppiSize roiSize)
{
    const int width = roiSize.width;
    const int height = roiSize.height;
    const int widthIsEven = !(width & 1);
    Ipp8u *pRowBytes;
    int row, col, rowsLeft;

    /* Border columns: every second row starting from row 2. */
    for (row = 2; row < height; row += 2) {
        Ipp32f *pRow = (Ipp32f *)((Ipp8u *)pSrcDst + row * step);

        pRow[0] = -pRow[0];
        if (widthIsEven) {
            pRow[width - 1] = -pRow[width - 1];
        }
    }

    /* Interior: every second column starting from column 2, in all rows. */
    pRowBytes = (Ipp8u *)pSrcDst;
    for (rowsLeft = height; rowsLeft != 0; rowsLeft--) {
        Ipp32f *pRow = (Ipp32f *)pRowBytes;

        for (col = 2; col < width; col += 2) {
            pRow[col] = -pRow[col];
        }
        pRowBytes += step;
    }
}

/* ======================================================================== */
/* ----- Forward transforms ----------------------------------------------- */
/* ======================================================================== */

/*F*
//  Name:       owniFFTFwd_RToPack_32f_C1R
//  Purpose:    compute forward FFT of the image, transforming only source
//              rows [start, end) in the row pass; all other rows are taken
//              as zero in the column pass
//  Arguments:
//     pSrc     - pointer to source image
//     srcStep  - the step in Src image, in bytes
//     pDst     - pointer to destination image
//     dstStep  - the step in Dst image, in bytes
//     pFFTSpec - pointer to FFT context
//     pBuffer  - pointer to work buffer
//     start    - index of the first source row passed to the row transform
//     end      - one past the last such row (clamped to the FFT height)
//  Return:     status (ippStsNoErr, or the first error returned by a 1D FFT)
//  Notes:      start/end are not used when either FFT dimension has length 1
*F*/

IppStatus owniFFTFwd_RToPack_32f_C1R(const Ipp32f *pSrc, int srcStep, Ipp32f *pDst, int dstStep, const IppiFFTSpec_R_32f *pFFTSpec, Ipp8u *pBuffer,
                                     int start, int end)
{
    /* Both the spec and the work buffer are accessed through aligned pointers */
    IppiFFTContext_32f *ctxFFT = (IppiFFTContext_32f *)IPP_ALIGNED_PTR(pFFTSpec, IPP_MALLOC_ALIGNED_BYTES);
    IppsFFTSpec_R_32f *ctx_R_X, *ctx_R_Y;
    IppsFFTSpec_C_32fc *ctx_C_Y;
    IppStatus res;
    int lenX, lenY;
    Ipp8u *buf;
    Ipp32f *b0, *b1, *b2, *b3;
    Ipp32f *curSrc, *curDst;
    int i, j;

    buf = (Ipp8u *)IPP_ALIGNED_PTR(pBuffer, IPP_MALLOC_ALIGNED_BYTES);
    b0 = (Ipp32f *)buf;

    lenX = (1 << ctxFFT->orderX);
    lenY = (1 << ctxFFT->orderY);
    if (end > lenY)
        end = lenY;
    ctx_R_X = (IppsFFTSpec_R_32f *)ctxFFT->ctxFFT_R_X;
    ctx_R_Y = (IppsFFTSpec_R_32f *)ctxFFT->ctxFFT_R_Y;
    ctx_C_Y = (IppsFFTSpec_C_32fc *)ctxFFT->ctxFFT_C_Y;
    /* No separate real spec for the Y direction: reuse the X one */
    if (ctx_R_Y == NULL)
        ctx_R_Y = ctx_R_X;

    /* Degenerate case: a single row - one 1D transform */
    if (lenY == 1) {
        res = ippsFFTFwd_RToPack_32f(pSrc, pDst, ctx_R_X, buf);
        return res;
    }
    /* Degenerate case: a single column */
    if (lenX == 1) {
        /* Column elements are adjacent in memory: transform directly */
        if ((srcStep == sizeof(Ipp32f)) && (dstStep == sizeof(Ipp32f))) {
            res = ippsFFTFwd_RToPack_32f(pSrc, pDst, ctx_R_Y, buf);
            return res;
        }
        /* Otherwise gather the column into b0, transform in place, scatter to pDst */
        curSrc = (Ipp32f *)pSrc;
        for (j = 0; j < lenY; j++) {
            b0[j] = curSrc[0];
            curSrc = (Ipp32f *)((char *)curSrc + srcStep);
        }
        res = ippsFFTFwd_RToPack_32f(b0, b0, ctx_R_Y, (Ipp8u *)(b0 + lenY));
        if (res != ippStsNoErr)
            return res;
        curDst = pDst;
        for (j = 0; j < lenY; j++) {
            curDst[0] = b0[j];
            curDst = (Ipp32f *)((char *)curDst + dstStep);
        }
        return ippStsNoErr;
    }
    /* Row pass: 1D real FFT of rows [start, end) from pSrc into pDst */
    curSrc = (Ipp32f *)((char *)pSrc + start * srcStep);
    curDst = (Ipp32f *)((char *)pDst + start * dstStep);
    for (j = start; j < end; j++) {
        res = ippsFFTFwd_RToPack_32f(curSrc, curDst, ctx_R_X, buf);
        if (res != ippStsNoErr)
            return res;
        curSrc = (Ipp32f *)((char *)curSrc + srcStep);
        curDst = (Ipp32f *)((char *)curDst + dstStep);
    }
    /* Column pass, column 0: gathered into b0 with zeros for rows outside [start, end),
       transformed as a real sequence and written back to all lenY rows */
    curDst = pDst;
    for (j = 0; j < start; j++) {
        b0[j] = 0;
        curDst = (Ipp32f *)((char *)curDst + dstStep);
    }
    for (j = start; j < end; j++) {
        b0[j] = curDst[0];
        curDst = (Ipp32f *)((char *)curDst + dstStep);
    }
    for (j = end; j < lenY; j++) {
        b0[j] = 0;
        curDst = (Ipp32f *)((char *)curDst + dstStep);
    }
    res = ippsFFTFwd_RToPack_32f(b0, b0, ctx_R_Y, (Ipp8u *)(b0 + lenY));
    if (res != ippStsNoErr)
        return res;
    curDst = pDst;
    for (j = 0; j < lenY; j++) {
        curDst[0] = b0[j];
        curDst = (Ipp32f *)((char *)curDst + dstStep);
    }

    /* Column pass, columns 1 .. lenX-2: adjacent column pairs are handled as
       interleaved complex sequences.  Four pairs (8 columns) are processed per
       iteration in b0..b3, each 2 * lenY floats long; the FFT scratch area
       starts right after them at b0 + 8 * lenY */
    b1 = b0 + 2 * lenY;
    b2 = b1 + 2 * lenY;
    b3 = b2 + 2 * lenY;
    for (i = 1; i < ((lenX - 1) & ~7) + 1; i += 8) {
        curDst = pDst + i;
        for (j = 0; j < start; j++) {
            b0[2 * j] = 0;
            b0[2 * j + 1] = 0;
            b1[2 * j] = 0;
            b1[2 * j + 1] = 0;
            b2[2 * j] = 0;
            b2[2 * j + 1] = 0;
            b3[2 * j] = 0;
            b3[2 * j + 1] = 0;
            curDst = (Ipp32f *)((char *)curDst + dstStep);
        }
        for (j = start; j < end; j++) {
            b0[2 * j] = curDst[0];
            b0[2 * j + 1] = curDst[1];
            b1[2 * j] = curDst[2];
            b1[2 * j + 1] = curDst[3];
            b2[2 * j] = curDst[4];
            b2[2 * j + 1] = curDst[5];
            b3[2 * j] = curDst[6];
            b3[2 * j + 1] = curDst[7];
            curDst = (Ipp32f *)((char *)curDst + dstStep);
        }
        for (j = end; j < lenY; j++) {
            b0[2 * j] = 0;
            b0[2 * j + 1] = 0;
            b1[2 * j] = 0;
            b1[2 * j + 1] = 0;
            b2[2 * j] = 0;
            b2[2 * j + 1] = 0;
            b3[2 * j] = 0;
            b3[2 * j + 1] = 0;
            curDst = (Ipp32f *)((char *)curDst + dstStep);
        }
        res = ippsFFTFwd_CToC_32fc((Ipp32fc *)b0, (Ipp32fc *)b0, ctx_C_Y, (Ipp8u *)(b0 + 8 * lenY));
        if (res != ippStsNoErr)
            return res;
        res = ippsFFTFwd_CToC_32fc((Ipp32fc *)b1, (Ipp32fc *)b1, ctx_C_Y, (Ipp8u *)(b0 + 8 * lenY));
        if (res != ippStsNoErr)
            return res;
        res = ippsFFTFwd_CToC_32fc((Ipp32fc *)b2, (Ipp32fc *)b2, ctx_C_Y, (Ipp8u *)(b0 + 8 * lenY));
        if (res != ippStsNoErr)
            return res;
        res = ippsFFTFwd_CToC_32fc((Ipp32fc *)b3, (Ipp32fc *)b3, ctx_C_Y, (Ipp8u *)(b0 + 8 * lenY));
        if (res != ippStsNoErr)
            return res;
        /* Scatter the four transformed column pairs back into all lenY rows */
        curDst = pDst + i;
        for (j = 0; j < lenY; j++) {
            curDst[0] = b0[2 * j];
            curDst[1] = b0[2 * j + 1];
            curDst[2] = b1[2 * j];
            curDst[3] = b1[2 * j + 1];
            curDst[4] = b2[2 * j];
            curDst[5] = b2[2 * j + 1];
            curDst[6] = b3[2 * j];
            curDst[7] = b3[2 * j + 1];
            curDst = (Ipp32f *)((char *)curDst + dstStep);
        }
    }

    /* Column pairs left over after the 8-column groups: one pair at a time in b0,
       with the FFT scratch area at b0 + 2 * lenY */
    for (i = ((lenX - 1) & ~7) + 1; i < lenX - 1; i += 2) {
        curDst = pDst + i;
        for (j = 0; j < start; j++) {
            b0[2 * j] = 0;
            b0[2 * j + 1] = 0;
            curDst = (Ipp32f *)((char *)curDst + dstStep);
        }
        for (j = start; j < end; j++) {
            b0[2 * j] = curDst[0];
            b0[2 * j + 1] = curDst[1];
            curDst = (Ipp32f *)((char *)curDst + dstStep);
        }
        for (j = end; j < lenY; j++) {
            b0[2 * j] = 0;
            b0[2 * j + 1] = 0;
            curDst = (Ipp32f *)((char *)curDst + dstStep);
        }
        res = ippsFFTFwd_CToC_32fc((Ipp32fc *)b0, (Ipp32fc *)b0, ctx_C_Y, (Ipp8u *)(b0 + 2 * lenY));
        if (res != ippStsNoErr)
            return res;
        curDst = pDst + i;
        for (j = 0; j < lenY; j++) {
            curDst[0] = b0[2 * j];
            curDst[1] = b0[2 * j + 1];
            curDst = (Ipp32f *)((char *)curDst + dstStep);
        }
    }

    /* Column pass, last column (lenX - 1): handled like column 0, as a real sequence */
    curDst = pDst + lenX - 1;
    for (j = 0; j < start; j++) {
        b0[j] = 0;
        curDst = (Ipp32f *)((char *)curDst + dstStep);
    }
    for (j = start; j < end; j++) {
        b0[j] = curDst[0];
        curDst = (Ipp32f *)((char *)curDst + dstStep);
    }
    for (j = end; j < lenY; j++) {
        b0[j] = 0;
        curDst = (Ipp32f *)((char *)curDst + dstStep);
    }
    res = ippsFFTFwd_RToPack_32f(b0, b0, ctx_R_Y, (Ipp8u *)(b0 + lenY));
    if (res != ippStsNoErr)
        return res;
    curDst = pDst + lenX - 1;
    for (j = 0; j < lenY; j++) {
        curDst[0] = b0[j];
        curDst = (Ipp32f *)((char *)curDst + dstStep);
    }

    return ippStsNoErr;
}

/*
 * Fills the task descriptor *ts that is shared by all tile workers of the
 * threaded cross-correlation.
 *
 * sizeWrkBuf, sizeA, srcDataType and dstDataType are read from pCfg; every
 * other field is a plain copy of the argument with the same name. pSrc is
 * stored through a (void *) cast, i.e. without its const qualifier.
 * Fields of *ts that are not listed below are left untouched.
 */
static void CrossCorrNormThreadingStructureEncode(
    OwniCrossCorrConfig *pCfg, IppStatus *stsBuf, int nThreads, int nFrames, int w, int onceW, int onceH, int shftW, int shftH, Ipp32f *pMemBuf,
    Ipp32f *tplFft, int stsBufSize, int thrMemSize, IppiSize dstSize, IppiSize srcRoiSize, IppiSize tplRoiSize, IppiSize fftS, int srcDataSize,
    int dstDataSize, const void *pSrc, void *pDst, Ipp32f tplNorm, Ipp32f sqrNorm, Ipp32f thresh, int srcStep, int aStep, int fStep, int dstStep,
    int scaleFactor, IppiFFTSpec_R_32f *ctxFft, void (*pAuto)(const Ipp32f *, int, IppiSize, Ipp32f *, int, IppiSize, Ipp64f *, Ipp32f *, Ipp32f *),
    void (*pDeno)(const Ipp32f *, int, IppiSize, Ipp32f *, int, IppiSize, Ipp64f *, int, Ipp32f *, Ipp32f *), IppiNormOp normOp,
    ippiCrossCorrNorm_T_Str *ts)
{
    ippiCrossCorrNorm_T_Str *const task = ts;

    /* Values taken from the configuration object. */
    task->sizeWrkBuf = pCfg->sizeWrkBuf;
    task->sizeA = pCfg->sizeA;
    task->srcDataType = pCfg->srcDataType;
    task->dstDataType = pCfg->dstDataType;

    /* Source and destination images. */
    task->pSrc = (void *)pSrc;
    task->srcStep = srcStep;
    task->srcDataSize = srcDataSize;
    task->srcRoiSize = srcRoiSize;
    task->pDst = pDst;
    task->dstStep = dstStep;
    task->dstDataSize = dstDataSize;
    task->dstSize = dstSize;
    task->scaleFactor = scaleFactor;

    /* Tiling: tile grid width, tile size and source shift. */
    task->nThreads = nThreads;
    task->nFrames = nFrames;
    task->w = w;
    task->onceW = onceW;
    task->onceH = onceH;
    task->shftW = shftW;
    task->shftH = shftH;

    /* Shared scratch memory and its partitioning. */
    task->pMemBuf = pMemBuf;
    task->stsBuf = stsBuf;
    task->stsBufSize = stsBufSize;
    task->thrMemSize = thrMemSize;

    /* FFT state and row strides. */
    task->ctxFft = ctxFft;
    task->fftS = fftS;
    task->fStep = fStep;
    task->aStep = aStep;

    /* Template data and normalization parameters. */
    task->tplFft = tplFft;
    task->tplRoiSize = tplRoiSize;
    task->tplNorm = tplNorm;
    task->sqrNorm = sqrNorm;
    task->thresh = thresh;
    task->normOp = normOp;
    task->pAuto = pAuto;
    task->pDeno = pDeno;
}

IppStatus ippiCrossCorrNorm_FullSame_32f_C1R_T_Fun(int t, void *arg)
{
    ippiCrossCorrNorm_T_Str *ts = (ippiCrossCorrNorm_T_Str *)arg;
    int id;
    int curW, curH;

    IppStatus status = ippStsNoErr;
    Ipp32f *frameFft, *aBuf;
    Ipp64f *fBuf;
    Ipp8u *pBufFft;
    void *curSrc, *curDst;
    IppiSize junk, shiftS, frame;

    ippGetThreadIdx_T(&id);

    frameFft = ts->pMemBuf + (ts->sizeWrkBuf / 2) + ts->stsBufSize + id * ts->thrMemSize;
    aBuf = frameFft + (ts->sizeWrkBuf / 2);
    pBufFft = (Ipp8u *)(aBuf + ts->sizeA);
    fBuf = (Ipp64f *)pBufFft;
    ts->stsBuf[id + 1] = ippStsNoErr;

    curH = (t / ts->w) * ts->onceH;
    curW = (t % ts->w) * ts->onceW;
    frame.height = IPP_MIN(ts->onceH, (ts->dstSize.height - curH));
    frame.width = IPP_MIN(ts->onceW, (ts->dstSize.width - curW));
    junk.width = IPP_MIN(ts->srcRoiSize.width, ts->srcRoiSize.width - curW + ts->shftW);
    junk.height = IPP_MIN(ts->srcRoiSize.height, ts->srcRoiSize.height - curH + ts->shftH);
    junk.width = IPP_MIN(ts->fftS.width, junk.width);
    junk.height = IPP_MIN(ts->fftS.height, junk.height);
    if (curH == 0) {
        if (curW == 0) {
            curSrc = (void *)ts->pSrc;
            shiftS.width = ts->shftW;
            shiftS.height = ts->shftH;
        } else {
            curSrc = (void *)((Ipp8u *)ts->pSrc + (curW - ts->shftW) * ts->srcDataSize);
            shiftS.width = 0;
            shiftS.height = ts->shftH;
        }
    } else {
        if (curW == 0) {
            curSrc = (void *)((Ipp8u *)ts->pSrc + (curH - ts->shftH) * ts->srcStep);
            shiftS.width = ts->shftW;
            shiftS.height = 0;
        } else {
            curSrc = (void *)((Ipp8u *)ts->pSrc + (curW - ts->shftW) * ts->srcDataSize + (curH - ts->shftH) * ts->srcStep);
            shiftS.width = shiftS.height = 0;
        }
    }

    switch (ts->srcDataType) {
    case ipp32f:
        owniShiftClipRectZeroTail_32f_C1R((Ipp32f *)curSrc, ts->srcStep, junk, frameFft, ts->fftS, shiftS);
        break;
    case ipp16u:
        owniShiftClipRectZeroTail_16u32f_C1R((Ipp16u *)curSrc, ts->srcStep, junk, frameFft, ts->fftS, shiftS);
        break;
    case ipp8u:
        owniShiftClipRectZeroTail_8u32f_C1R((Ipp8u *)curSrc, ts->srcStep, junk, frameFft, ts->fftS, shiftS);
        break;
    }

    switch (ts->normOp) {
    case ippiNorm:
        (*ts->pAuto)(frameFft, ts->fftS.width, ts->tplRoiSize, aBuf, ts->onceW, frame, fBuf, &ts->thresh, &ts->tplNorm);
        break;
    case ippiNormCoefficient:
        (*ts->pDeno)(frameFft, ts->fftS.width, ts->tplRoiSize, aBuf, ts->onceW, frame, fBuf, ts->onceW, &ts->thresh, &ts->sqrNorm);
        break;
    default:
        break; // ippiNormNone
    }

    status = owniFFTFwd_RToPack_32f_C1R(frameFft, ts->fStep, frameFft, ts->fStep, ts->ctxFft, pBufFft, shiftS.height, shiftS.height + junk.height);
    if (ts->stsBuf[id + 1] > status)
        ts->stsBuf[id + 1] = status;
    ippiMulPack_32f_C1R(ts->tplFft, ts->fStep, frameFft, ts->fStep, frameFft, ts->fStep, ts->fftS);
    status = ippiFFTInv_PackToR_32f_C1R(frameFft, ts->fStep, frameFft, ts->fStep, ts->ctxFft, pBufFft);
    if (ts->stsBuf[id + 1] > status)
        ts->stsBuf[id + 1] = status;

    switch (ts->normOp) {
    case ippiNorm:
    case ippiNormCoefficient:
        owniCCDiv_32f_C1IR(aBuf, ts->aStep, frameFft, ts->fStep, frame);
        break;
    default:
        break; // ippiNormNone
    }

    curDst = (void *)((Ipp8u *)ts->pDst + curW * ts->dstDataSize + curH * ts->dstStep);
    switch (ts->dstDataType) {
    case ipp32f:
        ippiCopy_32f_C1R(frameFft, ts->fStep, (Ipp32f *)curDst, ts->dstStep, frame);
        break;
    case ipp8u:
        if (ts->normOp == ippiNormNone) {
            ippiConvert_32f8u_C1RSfs(frameFft, ts->fStep, (Ipp8u *)curDst, ts->dstStep, frame, ippRndNear, ts->scaleFactor);
        } else { // ippiNorm & ippiNormCoefficient
            ippiConvert_32f8u_C1R(frameFft, ts->fStep, (Ipp8u *)curDst, ts->dstStep, frame, ippRndNear);
        }
        break;
    }

    return status;
}

/*
 * Tile worker for the Valid-shaped threaded cross-correlation.
 *
 * Parameters:
 *   t   - linear tile index; row = t / ts->w, column = t % ts->w.
 *   arg - pointer to the shared ippiCrossCorrNorm_T_Str task descriptor.
 *
 * For its tile the worker copies the source window that starts at the tile's
 * own destination coordinates into the calling thread's FFT frame, optionally
 * computes the normalization denominators (pAuto / pDeno), multiplies the
 * frame spectrum with the template spectrum, transforms back, divides by the
 * denominators when normalization is requested and stores the result into
 * the destination image.
 *
 * Returns the first negative status reported by the forward or inverse FFT;
 * in that case the tile is abandoned and nothing is written to the
 * destination. Otherwise the status of the inverse FFT is returned. The same
 * value is mirrored into ts->stsBuf[id + 1] for the calling thread.
 */
IppStatus ippiCrossCorrNorm_Valid_32f_C1R_T_Fun(int t, void *arg)
{
    ippiCrossCorrNorm_T_Str *ts = (ippiCrossCorrNorm_T_Str *)arg;
    int id = 0;
    int curW, curH;

    IppStatus status = ippStsNoErr;
    Ipp32f *frameFft, *aBuf;
    Ipp64f *fBuf;
    Ipp8u *pBufFft;
    void *curSrc, *curDst;
    IppiSize junk, frame;

    ippGetThreadIdx_T(&id);

    /* Per-thread scratch: [frameFft | aBuf | FFT work buffer]. It starts after the
       shared template spectrum (sizeWrkBuf / 2 floats) and the status array. */
    frameFft = ts->pMemBuf + (ts->sizeWrkBuf / 2) + ts->stsBufSize + id * ts->thrMemSize;
    aBuf = frameFft + (ts->sizeWrkBuf / 2);
    pBufFft = (Ipp8u *)(aBuf + ts->sizeA);
    /* fBuf shares storage with the FFT work buffer: it is only used by pAuto/pDeno,
       which run before the FFT calls below. */
    fBuf = (Ipp64f *)pBufFft;
    ts->stsBuf[id + 1] = ippStsNoErr;

    /* Top-left corner of this tile in the destination and its (possibly clipped) size. */
    curH = (t / ts->w) * ts->onceH;
    curW = (t % ts->w) * ts->onceW;
    frame.height = IPP_MIN(ts->onceH, (ts->dstSize.height - curH));
    frame.width = IPP_MIN(ts->onceW, (ts->dstSize.width - curW));

    /* Source window: starts at the tile origin, limited to the FFT frame size. */
    junk.width = IPP_MIN(ts->fftS.width, ts->srcRoiSize.width - curW);
    junk.height = IPP_MIN(ts->fftS.height, ts->srcRoiSize.height - curH);
    curSrc = (void *)((Ipp8u *)ts->pSrc + curW * ts->srcDataSize + curH * ts->srcStep);

    switch (ts->srcDataType) {
    case ipp32f:
        owniClipRectZeroTail_32f_C1R((Ipp32f *)curSrc, ts->srcStep, junk, frameFft, ts->fftS);
        break;
    case ipp16u:
        owniClipRectZeroTail_16u32f_C1R((Ipp16u *)curSrc, ts->srcStep, junk, frameFft, ts->fftS);
        break;
    case ipp8u:
        owniClipRectZeroTail_8u32f_C1R((Ipp8u *)curSrc, ts->srcStep, junk, frameFft, ts->fftS);
        break;
    }

    /* Normalization denominators are computed from the spatial-domain frame,
       before it is overwritten by its spectrum. */
    switch (ts->normOp) {
    case ippiNorm:
        (*ts->pAuto)(frameFft, ts->fftS.width, ts->tplRoiSize, aBuf, ts->onceW, frame, fBuf, &ts->thresh, &ts->tplNorm);
        break;
    case ippiNormCoefficient:
        (*ts->pDeno)(frameFft, ts->fftS.width, ts->tplRoiSize, aBuf, ts->onceW, frame, fBuf, ts->onceW, &ts->thresh, &ts->sqrNorm);
        break;
    default:
        break; // ippiNormNone
    }

    status = owniFFTFwd_RToPack_32f_C1R(frameFft, ts->fStep, frameFft, ts->fStep, ts->ctxFft, pBufFft, 0, junk.height);
    if (status < ippStsNoErr) {
        /* Stop here: continuing would hide this error behind the inverse FFT status. */
        ts->stsBuf[id + 1] = status;
        return status;
    }
    ippiMulPack_32f_C1R(ts->tplFft, ts->fStep, frameFft, ts->fStep, frameFft, ts->fStep, ts->fftS);
    status = ippiFFTInv_PackToR_32f_C1R(frameFft, ts->fStep, frameFft, ts->fStep, ts->ctxFft, pBufFft);
    if (status < ippStsNoErr) {
        ts->stsBuf[id + 1] = status;
        return status;
    }

    switch (ts->normOp) {
    case ippiNorm:
    case ippiNormCoefficient:
        owniCCDiv_32f_C1IR(aBuf, ts->aStep, frameFft, ts->fStep, frame);
        break;
    default:
        break; // ippiNormNone
    }

    curDst = (void *)((Ipp8u *)ts->pDst + curW * ts->dstDataSize + curH * ts->dstStep);
    switch (ts->dstDataType) {
    case ipp32f:
        ippiCopy_32f_C1R(frameFft, ts->fStep, (Ipp32f *)curDst, ts->dstStep, frame);
        break;
    case ipp8u:
        if (ts->normOp == ippiNormNone) {
            ippiConvert_32f8u_C1RSfs(frameFft, ts->fStep, (Ipp8u *)curDst, ts->dstStep, frame, ippRndNear, ts->scaleFactor);
        } else { // ippiNorm & ippiNormCoefficient
            ippiConvert_32f8u_C1R(frameFft, ts->fStep, (Ipp8u *)curDst, ts->dstStep, frame, ippRndNear);
        }
        break;
    }

    return status;
}

/*
 * Threaded driver for Full- and Same-shaped (normalized) cross-correlation.
 *
 * Parameters:
 *   pSrc, srcStep - source image and its row stride in bytes.
 *   pTpl, tplStep - template image and its row stride in bytes.
 *   pDst, dstStep - destination image and its row stride in bytes.
 *   scaleFactor   - scale factor, used only for the 8u destination type.
 *   pCfg          - configuration: algType, data types, ROI / FFT / frame sizes,
 *                   FFT orders and scratch buffer sizes.
 *   pBuffer       - caller-provided buffer; used as the FFT specification storage
 *                   on the tiled path and forwarded as the work buffer otherwise.
 *
 * When the source area exceeds the block-algorithm threshold times the template
 * area, the destination is split into frameRoiSize tiles that are processed by
 * ippiCrossCorrNorm_FullSame_32f_C1R_T_Fun through ippParallelFor_T. Otherwise
 * the call is forwarded to the matching non-_T ippiCrossCorrNorm_* function.
 *
 * Returns:
 *   ippStsDataTypeErr  - unsupported source or destination data type.
 *   ippStsMemAllocErr  - the shared scratch buffer could not be allocated.
 *   a negative status  - from FFT initialization or the template forward FFT.
 *   otherwise the status of ippParallelFor_T or of the forwarded call.
 */
IppStatus owniCrossCorrNorm_FullSame_32f_C1R_T(const void *pSrc, int srcStep, const void *pTpl, int tplStep, void *pDst, int dstStep, int scaleFactor,
                                               OwniCrossCorrConfig *pCfg, Ipp8u *pBuffer)
{
    const IppiROIShape shape = (IppiROIShape)(pCfg->algType & ippiROIMask);
    const IppiNormOp normOp = (IppiNormOp)(pCfg->algType & ippiNormMask);

    IppiSize srcRoiSize, tplRoiSize, dstSize;
    srcRoiSize = pCfg->srcRoiSize;
    tplRoiSize = pCfg->tplRoiSize;
    dstSize = pCfg->dstRoiSize;

    int srcDataSize, dstDataSize;
    Ipp32f mpy;

    switch (pCfg->srcDataType) {
    case ipp32f:
        srcDataSize = (int)sizeof(Ipp32f);
        break;
    case ipp16u:
        srcDataSize = (int)sizeof(Ipp16u);
        break;
    case ipp8u:
        srcDataSize = (int)sizeof(Ipp8u);
        break;
    default:
        return ippStsDataTypeErr;
    }

    switch (pCfg->dstDataType) {
    case ipp32f:
        dstDataSize = (int)sizeof(Ipp32f);
        mpy = 1.f;
        break;
    case ipp8u:
        dstDataSize = (int)sizeof(Ipp8u);
        /* NOTE(review): presumably the multiplier matching -scaleFactor; confirm against get_mpy. */
        mpy = get_mpy(-scaleFactor);
        break;
    default:
        return ippStsDataTypeErr;
    }

    int corr_block_alg = 0;
    switch (shape) {
    case ippiROIFull:
        corr_block_alg = _CORRFULL_BLOCK_ALG;
        break;
    default:
        corr_block_alg = _CORRSAME_BLOCK_ALG;
        break;
    }
    if (srcRoiSize.width * srcRoiSize.height > corr_block_alg * tplRoiSize.width * tplRoiSize.height) {
        IppStatus status = ippStsNoErr, *stsBuf = NULL;
        Ipp8u *my_pBufFft = NULL;
        Ipp8u *pMemInit = NULL;
        Ipp32f *pMemBuf = NULL, thresh = THRESH_32F;
        Ipp32f *tplFft = NULL, tplNorm = 0, tplMean, sqrNorm = 0;
        IppiFFTSpec_R_32f *ctxFft;
        int fStep, aStep, shftW = 0, shftH = 0, onceW, onceH;
        Ipp64f tplNormL2, tplMean64;
        int w = 0, h, nFrames = 0, thrMemSize, stsBufSize, nThreads = 0;
        IppiSize fftS;
        void (*pAuto)(const Ipp32f *, int, IppiSize, Ipp32f *, int, IppiSize, Ipp64f *, Ipp32f *, Ipp32f *);
        void (*pDeno)(const Ipp32f *, int, IppiSize, Ipp32f *, int, IppiSize, Ipp64f *, int, Ipp32f *, Ipp32f *);

        fftS = pCfg->fftRoiSize;
        fStep = (int)(fftS.width * sizeof(Ipp32f));

        pAuto = owniAutoCorr_C1R;
        pDeno = owniDenominator_C1R;

        switch (shape) {
        case ippiROIFull:
            if ((fftS.width >= dstSize.width) && (fftS.height >= dstSize.height)) // special case for Full_shape
            {
                pAuto = owniAutoCorrFull_C1R;
                pDeno = owniDenominatorFull_C1R;
            }
            shftW = tplRoiSize.width - 1;
            shftH = tplRoiSize.height - 1;
            break;
        case ippiROISame:
            shftW = tplRoiSize.width >> 1;
            shftH = tplRoiSize.height >> 1;
            break;
        default:
            break; /* shifts stay zero */
        }

        onceH = pCfg->frameRoiSize.height;
        onceW = pCfg->frameRoiSize.width;
        aStep = (int)(pCfg->frameRoiSize.width * sizeof(Ipp32f));

        /* The FFT specification lives in the caller's buffer; the initialization
           scratch is temporary and is released right after the init call so that it
           is not leaked on either the success or the failure path. */
        ctxFft = (IppiFFTSpec_R_32f *)pBuffer;
        pMemInit = ippsMalloc_8u(pCfg->sizeFftSpecBuf);
        status = ippiFFTInit_R_32f(pCfg->orderX, pCfg->orderY, IPP_FFT_DIV_INV_BY_N, ippAlgHintNone, ctxFft, pMemInit);
        ippsFree(pMemInit);
        if (status < ippStsNoErr)
            return status;

        ippGetNumThreads_T(&nThreads);

        /* Status array: (nThreads + 1) entries, rounded up to 16 bytes and expressed
           in Ipp32f elements. Explicit parentheses are required here because '>>'
           binds tighter than '&'. */
        stsBufSize = ((((nThreads + 1) * (int)sizeof(IppStatus)) + 15) & ~15) / (int)sizeof(Ipp32f);
        thrMemSize = (pCfg->sizeWrkBuf / 2) + pCfg->sizeA + pCfg->sizeFftBuf;

        /* Layout (in Ipp32f elements): [template spectrum | status array | per-thread scratch * nThreads]. */
        pMemBuf = ippsMalloc_32f((pCfg->sizeWrkBuf / 2) + stsBufSize + nThreads * thrMemSize);
        if (NULL == pMemBuf)
            return ippStsMemAllocErr;

        tplFft = pMemBuf;
        /* The first thread's scratch area doubles as the work buffer of the template FFT. */
        my_pBufFft = (Ipp8u *)(pMemBuf + (pCfg->sizeWrkBuf / 2) + stsBufSize);
        stsBuf = (IppStatus *)(pMemBuf + (pCfg->sizeWrkBuf / 2));

        switch (pCfg->srcDataType) {
        case ipp32f:
            owniClipRectZeroTail_32f_C1R((Ipp32f *)pTpl, tplStep, tplRoiSize, tplFft, fftS);
            break;
        case ipp16u:
            owniClipRectZeroTail_16u32f_C1R((Ipp16u *)pTpl, tplStep, tplRoiSize, tplFft, fftS);
            break;
        case ipp8u:
            owniClipRectZeroTail_8u32f_C1R((Ipp8u *)pTpl, tplStep, tplRoiSize, tplFft, fftS);
            break;
        }

        switch (normOp) {
        case ippiNorm:
            ippiNorm_L2_32f_C1R(tplFft, fStep, tplRoiSize, &tplNormL2, ippAlgHintAccurate);
            sqrNorm = (Ipp32f)(tplNormL2 * tplNormL2);
            thresh = THRESH_32F;
            if (tplNormL2 < thresh)
                tplNormL2 = thresh;
            tplNorm = (Ipp32f)tplNormL2 * mpy;
            break;
        case ippiNormCoefficient:
            ippiNorm_L2_32f_C1R(tplFft, fStep, tplRoiSize, &tplNormL2, ippAlgHintAccurate);
            ippiMean_32f_C1R(tplFft, fStep, tplRoiSize, &tplMean64, ippAlgHintAccurate);
            tplMean = (Ipp32f)tplMean64;
            /* Remove the template mean, then derive the squared norm of the centered template. */
            ippiSubC_32f_C1R(tplFft, fStep, tplMean, tplFft, fStep, tplRoiSize);
            sqrNorm = (Ipp32f)((tplNormL2 * tplNormL2 - (tplRoiSize.width * tplRoiSize.height) * tplMean64 * tplMean64));
            thresh = THRESH_32F;
            thresh = thresh * thresh;
            if (thresh > sqrNorm)
                sqrNorm = thresh;
            sqrNorm = sqrNorm * mpy * mpy;
            thresh *= tplRoiSize.width * tplRoiSize.height;
            break;
        default:
            break; // ippiNormNone
        }

        /* Template spectrum, conjugated in place; it is shared read-only by all tiles. */
        stsBuf[0] = owniFFTFwd_RToPack_32f_C1R(tplFft, fStep, tplFft, fStep, ctxFft, my_pBufFft, 0, tplRoiSize.height);
        if (stsBuf[0] < ippStsNoErr) {
            status = stsBuf[0];
            ippsFree(pMemBuf);
            return status;
        }
        owniRCPack2DConj_32f_C1IR(tplFft, fStep, fftS);

        /* Number of tiles per axis, rounded up to cover partial tiles. */
        h = (dstSize.height + onceH - 1) / onceH;
        w = (dstSize.width + onceW - 1) / onceW;
        nFrames = h * w;

        ippiCrossCorrNorm_T_Str ts;
        CrossCorrNormThreadingStructureEncode(pCfg, stsBuf, nThreads, nFrames, w, onceW, onceH, shftW, shftH, pMemBuf, tplFft, stsBufSize, thrMemSize,
                                              dstSize, srcRoiSize, tplRoiSize, fftS, srcDataSize, dstDataSize, pSrc, pDst, tplNorm, sqrNorm, thresh,
                                              srcStep, aStep, fStep, dstStep, scaleFactor, ctxFft, pAuto, pDeno, normOp, &ts);
        status = ippParallelFor_T(nFrames, (void *)&ts, ippiCrossCorrNorm_FullSame_32f_C1R_T_Fun);

        ippsFree(pMemBuf);

        return status;
    } else {
        IppEnum funCfg = (pCfg->algType);
        switch (pCfg->srcDataType) {
        case ipp32f:
            return ippiCrossCorrNorm_32f_C1R(pSrc, srcStep, srcRoiSize, pTpl, tplStep, tplRoiSize, pDst, dstStep, funCfg, pBuffer);
        case ipp16u:
            return ippiCrossCorrNorm_16u32f_C1R(pSrc, srcStep, srcRoiSize, pTpl, tplStep, tplRoiSize, pDst, dstStep, funCfg, pBuffer);
        case ipp8u:
            switch (pCfg->dstDataType) {
            case ipp32f:
                return ippiCrossCorrNorm_8u32f_C1R(pSrc, srcStep, srcRoiSize, pTpl, tplStep, tplRoiSize, pDst, dstStep, funCfg, pBuffer);
            case ipp8u:
                return ippiCrossCorrNorm_8u_C1RSfs(pSrc, srcStep, srcRoiSize, pTpl, tplStep, tplRoiSize, pDst, dstStep, scaleFactor, funCfg, pBuffer);
            default:
                return ippStsDataTypeErr;
            }
        default:
            return ippStsDataTypeErr;
        }
    }
}

/*
 * Threaded driver for Valid-shaped (normalized) cross-correlation.
 *
 * Parameters:
 *   pSrc, srcStep - source image and its row stride in bytes.
 *   pTpl, tplStep - template image and its row stride in bytes.
 *   pDst, dstStep - destination image and its row stride in bytes.
 *   scaleFactor   - scale factor, used only for the 8u destination type.
 *   pCfg          - configuration: algType, data types, ROI / FFT / frame sizes,
 *                   FFT orders and scratch buffer sizes.
 *   pBuffer       - caller-provided buffer; used as the FFT specification storage
 *                   on the tiled path and forwarded as the work buffer otherwise.
 *
 * Dispatch order:
 *   1. ippiNormCoefficient without an explicit ippAlgFFT request and with sizes
 *      inside the CROSSCORRVALID_* limits: forwarded to the non-_T function.
 *   2. Source area above _CORRVALID_N_BLOCK_ALG times the template area: the
 *      destination is split into frameRoiSize tiles processed by
 *      ippiCrossCorrNorm_Valid_32f_C1R_T_Fun through ippParallelFor_T.
 *   3. Otherwise: forwarded to the non-_T function with the original algType.
 *
 * Returns:
 *   ippStsDataTypeErr  - unsupported source or destination data type.
 *   ippStsMemAllocErr  - the shared scratch buffer could not be allocated.
 *   a negative status  - from FFT initialization or the template forward FFT.
 *   otherwise the status of ippParallelFor_T or of the forwarded call.
 */
IppStatus owniCrossCorrNorm_Valid_32f_C1R_T(const void *pSrc, int srcStep, const void *pTpl, int tplStep, void *pDst, int dstStep, int scaleFactor,
                                            OwniCrossCorrConfig *pCfg, Ipp8u *pBuffer)
{
    IppiNormOp normOp = (IppiNormOp)(pCfg->algType & ippiNormMask);
    IppAlgType algType = (IppAlgType)(pCfg->algType & ippAlgMask);

    IppiSize srcRoiSize, tplRoiSize, dstSize;
    srcRoiSize = pCfg->srcRoiSize;
    tplRoiSize = pCfg->tplRoiSize;
    dstSize = pCfg->dstRoiSize;

    int srcDataSize, dstDataSize;
    Ipp32f mpy;

    switch (pCfg->srcDataType) {
    case ipp32f:
        srcDataSize = (int)sizeof(Ipp32f);
        break;
    case ipp16u:
        srcDataSize = (int)sizeof(Ipp16u);
        break;
    case ipp8u:
        srcDataSize = (int)sizeof(Ipp8u);
        break;
    default:
        return ippStsDataTypeErr;
    }
    switch (pCfg->dstDataType) {
    case ipp32f:
        dstDataSize = (int)sizeof(Ipp32f);
        mpy = 1.f;
        break;
    case ipp8u:
        dstDataSize = (int)sizeof(Ipp8u);
        /* NOTE(review): presumably the multiplier matching -scaleFactor; confirm against get_mpy. */
        mpy = get_mpy(-scaleFactor);
        break;
    default:
        return ippStsDataTypeErr;
    }

    if ((normOp == ippiNormCoefficient) && (algType != ippAlgFFT)) {
        if ((((dstSize.width <= CROSSCORRVALID_FFT_CRITERION_OUTSIZE) || (tplRoiSize.width <= CROSSCORRVALID_FFT_CRITERION_TPLSIZE) ||
              (srcRoiSize.width <= CROSSCORRVALID_FFT_CRITERION_SRCSIZE))) &&
            (dstSize.width <= MAX_DIRECT_DIMENSION_SIZE) && (tplRoiSize.width * tplRoiSize.height < CROSSCORRVALID_MAX_DIRECT_TPL_SIZE)) {
            IppEnum funCfg = (IppEnum)(algType | normOp | ippiROIValid);
            switch (pCfg->srcDataType) {
            case ipp32f:
                return ippiCrossCorrNorm_32f_C1R(pSrc, srcStep, srcRoiSize, pTpl, tplStep, tplRoiSize, pDst, dstStep, funCfg, pBuffer);
            case ipp16u:
                return ippiCrossCorrNorm_16u32f_C1R(pSrc, srcStep, srcRoiSize, pTpl, tplStep, tplRoiSize, pDst, dstStep, funCfg, pBuffer);
            case ipp8u:
                switch (pCfg->dstDataType) {
                case ipp32f:
                    return ippiCrossCorrNorm_8u32f_C1R(pSrc, srcStep, srcRoiSize, pTpl, tplStep, tplRoiSize, pDst, dstStep, funCfg, pBuffer);
                case ipp8u:
                    return ippiCrossCorrNorm_8u_C1RSfs(pSrc, srcStep, srcRoiSize, pTpl, tplStep, tplRoiSize, pDst, dstStep, scaleFactor, funCfg,
                                                       pBuffer);
                default:
                    break; /* unreachable: dstDataType was validated above */
                }
                break;
            default:
                break; /* unreachable: srcDataType was validated above */
            }
        }
    }

    if (srcRoiSize.width * srcRoiSize.height > _CORRVALID_N_BLOCK_ALG * tplRoiSize.width * tplRoiSize.height) {
        IppStatus status, *stsBuf = NULL;
        int onceW, onceH;
        Ipp8u *pBufFft = NULL;
        Ipp8u *pMemInit = NULL;
        Ipp32f *pMemBuf = NULL, thresh = THRESH_32F;
        Ipp32f *tplFft = NULL, tplNorm = 0, sqrNorm = 0, tplMean;
        IppiFFTSpec_R_32f *ctxFft;
        int fStep, aStep;
        Ipp64f tplNormL2, tplMean64;
        int w = 0, h, nFrames = 0, thrMemSize, stsBufSize, nThreads = 0;
        void (*pAuto)(const Ipp32f *, int, IppiSize, Ipp32f *, int, IppiSize, Ipp64f *, Ipp32f *, Ipp32f *);
        void (*pDeno)(const Ipp32f *, int, IppiSize, Ipp32f *, int, IppiSize, Ipp64f *, int, Ipp32f *, Ipp32f *);

        pAuto = owniAutoCorr_C1R;
        pDeno = owniDenominator_C1R;

        IppiSize fftS;

        fftS = pCfg->fftRoiSize;
        fStep = (int)(fftS.width * sizeof(Ipp32f));

        onceH = pCfg->frameRoiSize.height;
        onceW = pCfg->frameRoiSize.width;
        aStep = (int)(pCfg->frameRoiSize.width * sizeof(Ipp32f));

        /* The FFT specification lives in the caller's buffer; the initialization
           scratch is temporary and is released right after the init call so that it
           is not leaked on either the success or the failure path. */
        ctxFft = (IppiFFTSpec_R_32f *)pBuffer;
        pMemInit = ippsMalloc_8u(pCfg->sizeFftSpecBuf);
        status = ippiFFTInit_R_32f(pCfg->orderX, pCfg->orderY, IPP_FFT_DIV_INV_BY_N, ippAlgHintNone, ctxFft, pMemInit);
        ippsFree(pMemInit);
        if (status < ippStsNoErr)
            return status;

        ippGetNumThreads_T(&nThreads);

        /* Status array: (nThreads + 1) entries, rounded up to 16 bytes and expressed
           in Ipp32f elements. Explicit parentheses are required here because '>>'
           binds tighter than '&'. */
        stsBufSize = ((((nThreads + 1) * (int)sizeof(IppStatus)) + 15) & ~15) / (int)sizeof(Ipp32f);
        thrMemSize = (pCfg->sizeWrkBuf / 2) + pCfg->sizeA + pCfg->sizeFftBuf;

        /* Layout (in Ipp32f elements): [template spectrum | status array | per-thread scratch * nThreads]. */
        pMemBuf = ippsMalloc_32f((pCfg->sizeWrkBuf / 2) + stsBufSize + nThreads * thrMemSize);
        if (NULL == pMemBuf)
            return ippStsMemAllocErr;

        tplFft = pMemBuf;
        /* The first thread's scratch area doubles as the work buffer of the template FFT. */
        pBufFft = (Ipp8u *)(pMemBuf + (pCfg->sizeWrkBuf / 2) + stsBufSize);
        stsBuf = (IppStatus *)(pMemBuf + (pCfg->sizeWrkBuf / 2));

        switch (pCfg->srcDataType) {
        case ipp32f:
            owniClipRectZeroTail_32f_C1R((Ipp32f *)pTpl, tplStep, tplRoiSize, tplFft, fftS);
            break;
        case ipp16u:
            owniClipRectZeroTail_16u32f_C1R((Ipp16u *)pTpl, tplStep, tplRoiSize, tplFft, fftS);
            break;
        case ipp8u:
            owniClipRectZeroTail_8u32f_C1R((Ipp8u *)pTpl, tplStep, tplRoiSize, tplFft, fftS);
            break;
        }

        switch (normOp) {
        case ippiNorm:
            ippiNorm_L2_32f_C1R(tplFft, fStep, tplRoiSize, &tplNormL2, ippAlgHintAccurate);
            thresh = THRESH_32F; // the same threshold is used for 32f and 8u
            if (tplNormL2 < thresh)
                tplNormL2 = thresh;
            tplNorm = (Ipp32f)tplNormL2 * mpy;
            break;
        case ippiNormCoefficient:
            ippiNorm_L2_32f_C1R(tplFft, fStep, tplRoiSize, &tplNormL2, ippAlgHintAccurate);
            ippiMean_32f_C1R(tplFft, fStep, tplRoiSize, &tplMean64, ippAlgHintAccurate);
            tplMean = (Ipp32f)tplMean64;
            /* Remove the template mean, then derive the squared norm of the centered template. */
            ippiSubC_32f_C1R(tplFft, fStep, tplMean, tplFft, fStep, tplRoiSize);
            sqrNorm = (Ipp32f)((tplNormL2 * tplNormL2 - (tplRoiSize.width * tplRoiSize.height) * tplMean64 * tplMean64));
            thresh = THRESH_32F;
            thresh = thresh * thresh;
            if (thresh > sqrNorm)
                sqrNorm = thresh;
            sqrNorm = sqrNorm * mpy * mpy;
            break;
        default:
            break; // ippiNormNone
        }

        /* Template spectrum, conjugated in place; it is shared read-only by all tiles. */
        stsBuf[0] = owniFFTFwd_RToPack_32f_C1R(tplFft, fStep, tplFft, fStep, ctxFft, pBufFft, 0, tplRoiSize.height);
        if (stsBuf[0] < ippStsNoErr) {
            status = stsBuf[0];
            ippsFree(pMemBuf);
            return status;
        }
        owniRCPack2DConj_32f_C1IR(tplFft, fStep, fftS);

        /* Number of tiles per axis, rounded up to cover partial tiles. */
        h = (dstSize.height + onceH - 1) / onceH;
        w = (dstSize.width + onceW - 1) / onceW;
        nFrames = h * w;

        ippiCrossCorrNorm_T_Str ts;
        CrossCorrNormThreadingStructureEncode(pCfg, stsBuf, nThreads, nFrames, w, onceW, onceH, 0, 0, pMemBuf, tplFft, stsBufSize, thrMemSize,
                                              dstSize, srcRoiSize, tplRoiSize, fftS, srcDataSize, dstDataSize, pSrc, pDst, tplNorm, sqrNorm, thresh,
                                              srcStep, aStep, fStep, dstStep, scaleFactor, ctxFft, pAuto, pDeno, normOp, &ts);
        status = ippParallelFor_T(nFrames, (void *)&ts, ippiCrossCorrNorm_Valid_32f_C1R_T_Fun);

        ippsFree(pMemBuf);
        return status;
    } else {
        IppEnum funCfg = (pCfg->algType);
        switch (pCfg->srcDataType) {
        case ipp32f:
            return ippiCrossCorrNorm_32f_C1R(pSrc, srcStep, srcRoiSize, pTpl, tplStep, tplRoiSize, pDst, dstStep, funCfg, pBuffer);
        case ipp16u:
            return ippiCrossCorrNorm_16u32f_C1R(pSrc, srcStep, srcRoiSize, pTpl, tplStep, tplRoiSize, pDst, dstStep, funCfg, pBuffer);
        case ipp8u:
            switch (pCfg->dstDataType) {
            case ipp32f:
                return ippiCrossCorrNorm_8u32f_C1R(pSrc, srcStep, srcRoiSize, pTpl, tplStep, tplRoiSize, pDst, dstStep, funCfg, pBuffer);
            case ipp8u:
                return ippiCrossCorrNorm_8u_C1RSfs(pSrc, srcStep, srcRoiSize, pTpl, tplStep, tplRoiSize, pDst, dstStep, scaleFactor, funCfg, pBuffer);
            default:
                return ippStsDataTypeErr;
            }
        default:
            return ippStsDataTypeErr;
        }
    }
}

/* ////////////////////////////////////////////////////////////////////////////
//  Names: ippiCrossCorrNorm_32f_C1R_T
//         ippiCrossCorrNorm_16u32f_C1R_T
//         ippiCrossCorrNorm_8u32f_C1R_T
//         ippiCrossCorrNorm_8u_C1RSfs_T
//  Purpose: Computes normalized cross-correlation between an image and a template.
//           The result image size depends on operation shape selected in algType mask as follows :
//             (Wa+Wb-1)*(Ha+Hb-1) for ippiROIFull mask,
//             (Wa)*(Ha)           for ippiROISame mask,
//             (Wa-Wb+1)*(Ha-Hb+1) for ippiROIValid mask,
//           where Wa*Ha and Wb*Hb are the sizes of the image and template correspondingly.
//           Support of normalization operations (set in the algType mask) is set by selecting the following masks:
//             ippiNormNone   - the cross-correlation without normalization.
//             ippiNorm - the normalized cross-correlation.
//             ippiNormCoefficient  - the normalized correlation coefficients.
//           If the IppAlgMask value in algType is equal to ippAlgAuto, the optimal algorithm is selected automatically.
//           For big data size, the function uses 2D FFT algorithm.
//  Parameters:
//    pSrc        - Pointer to the source image ROI.
//    srcStep     - Distance, in bytes, between the starting points of consecutive lines in the source image.
//    srcRoiSize  - Size of the source ROI in pixels.
//    pTpl        - Pointer to the template image.
//    tplStep     - Distance, in bytes, between the starting points of consecutive lines in the template image.
//    tplRoiSize  - Size of the template ROI in pixels.
//    pDst        - Pointer to the destination image ROI.
//    dstStep     - Distance, in bytes, between the starting points of consecutive lines in the destination image.
//    scaleFactor - Scale factor.
//    algType     - Bit-field mask for the algorithm type definition. Possible values are the results of composition of the IppAlgType,
//                  IppiROIShape, and IppiNormOp values.
//                  Usage example: algType=(ippiROIFull|ippAlgFFT|ippiNormNone); - full-shaped cross-correlation will be calculated using
//                  2D FFT without result normalization.
//    pBuffer     - Pointer to the work buffer.
//  Returns:
//    ippStsNoErr      OK.
//    ippStsNullPtrErr Error when any of the specified pointers is NULL.
//    ippStsStepErr    Error when the value of srcStep, tplStep, or dstStep is negative, or equal to zero.
//    ippStsSizeErr    Error when :
//                         srcRoiSize or tplRoiSize is negative, or equal to zero.
//                         The value of srcRoiSize is less than the corresponding value of tplRoiSize.
//    ippStsAlgTypeErr Error when :
//                         The result of the bitwise AND operation between the algType and ippAlgMask differs from the ippAlgAuto,
//                         ippAlgDirect, or ippAlgFFT values.
//                         The result of the bitwise AND operation between the algType and ippiROIMask differs from the ippiROIFull,
//                         ippiROISame, or ippiROIValid values.
//                         The result of the bitwise AND operation between the algType and ippiNormMask differs from the ippiNormNone,
//                         ippiNorm, or ippiNormCoefficient values.
*/

IPPFUN(IppStatus, ippiCrossCorrNorm_32f_C1R_T,
       (const Ipp32f *pSrc, int srcStep, IppiSize srcRoiSize, const Ipp32f *pTpl, int tplStep, IppiSize tplRoiSize, Ipp32f *pDst, int dstStep,
        IppEnum funCfg, Ipp8u *pBuffer))
{
    IppAlgType algType = (IppAlgType)(funCfg & ippAlgMask);
    IppiROIShape shape = (IppiROIShape)(funCfg & ippiROIMask);
    IppiNormOp norm = (IppiNormOp)(funCfg & ippiNormMask);

    if (pSrc == NULL || pTpl == NULL || pDst == NULL)
        return ippStsNullPtrErr;
    if (srcRoiSize.width <= 0 || srcRoiSize.height <= 0 || tplRoiSize.width <= 0 || tplRoiSize.height <= 0)
        return ippStsSizeErr;
    if (srcRoiSize.width - tplRoiSize.width + 1 <= 0 || srcRoiSize.height - tplRoiSize.height + 1 <= 0)
        return ippStsSizeErr;
    if (srcStep <= 0 || tplStep <= 0 || dstStep <= 0)
        return ippStsStepErr;
    if (pBuffer == NULL)
        return ippStsNullPtrErr;
    switch (algType) {
    case ippAlgAuto:
    case ippAlgDirect:
    case ippAlgFFT:
        break;
    default:
        return ippStsAlgTypeErr;
    }
    switch (shape) {
    case ippiROIFull:
    case ippiROISame:
    case ippiROIValid:
        break;
    default:
        return ippStsAlgTypeErr;
    }
    switch (norm) {
    case ippiNormNone:
    case ippiNorm:
    case ippiNormCoefficient:
        break;
    default:
        return ippStsAlgTypeErr;
    }

    {
        IppStatus status;
        OwniCrossCorrConfig objCfg;
        objCfg.algType = funCfg;
        objCfg.srcRoiSize = srcRoiSize;
        objCfg.tplRoiSize = tplRoiSize;
        objCfg.srcDataType = ipp32f;
        objCfg.dstDataType = ipp32f;

        owniCrossCorrNormGetSize(&objCfg);

        switch (shape) {
        case ippiROIValid:
            status = owniCrossCorrNorm_Valid_32f_C1R_T(pSrc, srcStep, pTpl, tplStep, pDst, dstStep, 0, &objCfg, pBuffer);
            break;
        default:
            status = owniCrossCorrNorm_FullSame_32f_C1R_T(pSrc, srcStep, pTpl, tplStep, pDst, dstStep, 0, &objCfg, pBuffer);
            break;
        }

        return status;
    }
}

IPPFUN(IppStatus, ippiCrossCorrNorm_16u32f_C1R_T,
       (const Ipp16u *pSrc, int srcStep, IppiSize srcRoiSize, const Ipp16u *pTpl, int tplStep, IppiSize tplRoiSize, Ipp32f *pDst, int dstStep,
        IppEnum funCfg, Ipp8u *pBuffer))
{
    IppAlgType algType = (IppAlgType)(funCfg & ippAlgMask);
    IppiROIShape shape = (IppiROIShape)(funCfg & ippiROIMask);
    IppiNormOp norm = (IppiNormOp)(funCfg & ippiNormMask);

    if (pSrc == NULL || pTpl == NULL || pDst == NULL)
        return ippStsNullPtrErr;
    if (srcRoiSize.width <= 0 || srcRoiSize.height <= 0 || tplRoiSize.width <= 0 || tplRoiSize.height <= 0)
        return ippStsSizeErr;
    if (srcRoiSize.width - tplRoiSize.width + 1 <= 0 || srcRoiSize.height - tplRoiSize.height + 1 <= 0)
        return ippStsSizeErr;
    if (srcStep <= 0 || tplStep <= 0 || dstStep <= 0)
        return ippStsStepErr;
    if (pBuffer == NULL)
        return ippStsNullPtrErr;
    switch (algType) {
    case ippAlgAuto:
    case ippAlgDirect:
    case ippAlgFFT:
        break;
    default:
        return ippStsAlgTypeErr;
    }
    switch (shape) {
    case ippiROIFull:
    case ippiROISame:
    case ippiROIValid:
        break;
    default:
        return ippStsAlgTypeErr;
    }
    switch (norm) {
    case ippiNormNone:
    case ippiNorm:
    case ippiNormCoefficient:
        break;
    default:
        return ippStsAlgTypeErr;
    }

    {
        IppStatus status;
        OwniCrossCorrConfig objCfg;
        objCfg.algType = funCfg;
        objCfg.srcRoiSize = srcRoiSize;
        objCfg.tplRoiSize = tplRoiSize;
        objCfg.srcDataType = ipp16u;
        objCfg.dstDataType = ipp32f;

        owniCrossCorrNormGetSize(&objCfg);

        switch (shape) {
        case ippiROIValid:
            status = owniCrossCorrNorm_Valid_32f_C1R_T(pSrc, srcStep, pTpl, tplStep, pDst, dstStep, 0, &objCfg, pBuffer);
            break;
        default:
            status = owniCrossCorrNorm_FullSame_32f_C1R_T(pSrc, srcStep, pTpl, tplStep, pDst, dstStep, 0, &objCfg, pBuffer);
            break;
        }

        return status;
    }
}

IPPFUN(IppStatus, ippiCrossCorrNorm_8u32f_C1R_T,
       (const Ipp8u *pSrc, int srcStep, IppiSize srcRoiSize, const Ipp8u *pTpl, int tplStep, IppiSize tplRoiSize, Ipp32f *pDst, int dstStep,
        IppEnum funCfg, Ipp8u *pBuffer))
{
    IppAlgType algType = (IppAlgType)(funCfg & ippAlgMask);
    IppiROIShape shape = (IppiROIShape)(funCfg & ippiROIMask);
    IppiNormOp norm = (IppiNormOp)(funCfg & ippiNormMask);

    if (pSrc == NULL || pTpl == NULL || pDst == NULL)
        return ippStsNullPtrErr;
    if (srcRoiSize.width <= 0 || srcRoiSize.height <= 0 || tplRoiSize.width <= 0 || tplRoiSize.height <= 0)
        return ippStsSizeErr;
    if (srcRoiSize.width - tplRoiSize.width + 1 <= 0 || srcRoiSize.height - tplRoiSize.height + 1 <= 0)
        return ippStsSizeErr;
    if (srcStep <= 0 || tplStep <= 0 || dstStep <= 0)
        return ippStsStepErr;
    if (pBuffer == NULL)
        return ippStsNullPtrErr;
    switch (algType) {
    case ippAlgAuto:
    case ippAlgDirect:
    case ippAlgFFT:
        break;
    default:
        return ippStsAlgTypeErr;
    }
    switch (shape) {
    case ippiROIFull:
    case ippiROISame:
    case ippiROIValid:
        break;
    default:
        return ippStsAlgTypeErr;
    }
    switch (norm) {
    case ippiNormNone:
    case ippiNorm:
    case ippiNormCoefficient:
        break;
    default:
        return ippStsAlgTypeErr;
    }

    {
        IppStatus status;
        OwniCrossCorrConfig objCfg;
        objCfg.algType = funCfg;
        objCfg.srcRoiSize = srcRoiSize;
        objCfg.tplRoiSize = tplRoiSize;
        objCfg.srcDataType = ipp8u;
        objCfg.dstDataType = ipp32f;

        owniCrossCorrNormGetSize(&objCfg);

        switch (shape) {
        case ippiROIValid:
            status = owniCrossCorrNorm_Valid_32f_C1R_T(pSrc, srcStep, pTpl, tplStep, pDst, dstStep, 0, &objCfg, pBuffer);
            break;
        default:
            status = owniCrossCorrNorm_FullSame_32f_C1R_T(pSrc, srcStep, pTpl, tplStep, pDst, dstStep, 0, &objCfg, pBuffer);
            break;
        }

        return status;
    }
}

IPPFUN(IppStatus, ippiCrossCorrNorm_8u_C1RSfs_T,
       (const Ipp8u *pSrc, int srcStep, IppiSize srcRoiSize, const Ipp8u *pTpl, int tplStep, IppiSize tplRoiSize, Ipp8u *pDst, int dstStep,
        int scaleFactor, IppEnum funCfg, Ipp8u *pBuffer))
{
    IppAlgType algType = (IppAlgType)(funCfg & ippAlgMask);
    IppiROIShape shape = (IppiROIShape)(funCfg & ippiROIMask);
    IppiNormOp norm = (IppiNormOp)(funCfg & ippiNormMask);

    if (pSrc == NULL || pTpl == NULL || pDst == NULL)
        return ippStsNullPtrErr;
    if (srcRoiSize.width <= 0 || srcRoiSize.height <= 0 || tplRoiSize.width <= 0 || tplRoiSize.height <= 0)
        return ippStsSizeErr;
    if (srcRoiSize.width - tplRoiSize.width + 1 <= 0 || srcRoiSize.height - tplRoiSize.height + 1 <= 0)
        return ippStsSizeErr;
    if (srcStep <= 0 || tplStep <= 0 || dstStep <= 0)
        return ippStsStepErr;
    if (pBuffer == NULL)
        return ippStsNullPtrErr;
    switch (algType) {
    case ippAlgAuto:
    case ippAlgDirect:
    case ippAlgFFT:
        break;
    default:
        return ippStsAlgTypeErr;
    }
    switch (shape) {
    case ippiROIFull:
    case ippiROISame:
    case ippiROIValid:
        break;
    default:
        return ippStsAlgTypeErr;
    }
    switch (norm) {
    case ippiNormNone:
    case ippiNorm:
    case ippiNormCoefficient:
        break;
    default:
        return ippStsAlgTypeErr;
    }

    {
        IppStatus status;
        OwniCrossCorrConfig objCfg;
        objCfg.algType = funCfg;
        objCfg.srcRoiSize = srcRoiSize;
        objCfg.tplRoiSize = tplRoiSize;
        objCfg.srcDataType = ipp8u;
        objCfg.dstDataType = ipp8u;

        owniCrossCorrNormGetSize(&objCfg);

        switch (shape) {
        case ippiROIValid:
            status = owniCrossCorrNorm_Valid_32f_C1R_T(pSrc, srcStep, pTpl, tplStep, pDst, dstStep, scaleFactor, &objCfg, pBuffer);
            break;
        default:
            status = owniCrossCorrNorm_FullSame_32f_C1R_T(pSrc, srcStep, pTpl, tplStep, pDst, dstStep, scaleFactor, &objCfg, pBuffer);
            break;
        }

        return status;
    }
}
