Changeset 56 in 3DVCSoftware for trunk/source/Lib/TLibCommon/TComRdCost.cpp


Ignore:
Timestamp:
11 May 2012, 21:20:17 (12 years ago)
Author:
hschwarz
Message:

updated trunk (move to HM6.1)

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/source/Lib/TLibCommon/TComRdCost.cpp

    r5 r56  
    22 * License, included below. This software may be subject to other third party
    33 * and contributor rights, including patent rights, and no such rights are
    4  * granted under this license.
     4 * granted under this license. 
    55 *
    6  * Copyright (c) 2010-2011, ISO/IEC
     6 * Copyright (c) 2010-2012, ITU/ISO/IEC
    77 * All rights reserved.
    88 *
     
    1515 *    this list of conditions and the following disclaimer in the documentation
    1616 *    and/or other materials provided with the distribution.
    17  *  * Neither the name of the ISO/IEC nor the names of its contributors may
     17 *  * Neither the name of the ITU/ISO/IEC nor the names of its contributors may
    1818 *    be used to endorse or promote products derived from this software without
    1919 *    specific prior written permission.
     
    3232 */
    3333
    34 
    35 
    3634/** \file     TComRdCost.cpp
    3735    \brief    RD cost computation class
     
    4038#include <math.h>
    4139#include <assert.h>
     40#include "TComRom.h"
    4241#include "TComRdCost.h"
    4342#include "TComDataCU.h"
    4443
     44//! \ingroup TLibCommon
     45//! \{
    4546
    4647TComRdCost::TComRdCost()
     
    5152TComRdCost::~TComRdCost()
    5253{
     54#if !FIX203
    5355  xUninit();
     56#endif
    5457}
    5558
    5659// Calculate RD functions
    57 Double TComRdCost::calcRdCost( UInt uiBits, Dist uiDistortion, Bool bFlag, DFunc eDFunc )
     60Double TComRdCost::calcRdCost( UInt uiBits, UInt uiDistortion, Bool bFlag, DFunc eDFunc )
    5861{
    5962  Double dRdCost = 0.0;
     
    8588  {
    8689    // Intra8x8, Intra4x4 Block only...
     90#if LOSSLESS_CODING && SEQUENCE_LEVEL_LOSSLESS
     91    dRdCost = (Double)(uiBits);
     92#else
    8793    dRdCost = (((Double)uiDistortion) + ((Double)uiBits * dLambda));
     94#endif
    8895  }
    8996  else
     
    9299    {
    93100      dRdCost = ((Double)uiDistortion + (Double)((Int)(uiBits * dLambda+.5)>>16));
    94       dRdCost = (Double)(Dist)floor(dRdCost);
     101      dRdCost = (Double)(UInt)floor(dRdCost);
    95102    }
    96103    else
    97104    {
     105#if LOSSLESS_CODING && SEQUENCE_LEVEL_LOSSLESS
     106      dRdCost = (Double)(uiBits);
     107#else
    98108      dRdCost = ((Double)uiDistortion + (Double)((Int)(uiBits * dLambda+.5)));
    99       dRdCost = (Double)(Dist)floor(dRdCost);
     109      dRdCost = (Double)(UInt)floor(dRdCost);
     110#endif
    100111    }
    101112  }
     
    131142  {
    132143    // Intra8x8, Intra4x4 Block only...
     144#if LOSSLESS_CODING && SEQUENCE_LEVEL_LOSSLESS
     145    dRdCost = (Double)(uiBits);
     146#else
    133147    dRdCost = (((Double)(Int64)uiDistortion) + ((Double)(Int64)uiBits * dLambda));
     148#endif
    134149  }
    135150  else
     
    142157    else
    143158    {
     159#if LOSSLESS_CODING && SEQUENCE_LEVEL_LOSSLESS
     160      dRdCost = (Double)(uiBits);
     161#else
    144162      dRdCost = ((Double)(Int64)uiDistortion + (Double)((Int)((Int64)uiBits * dLambda+.5)));
    145163      dRdCost = (Double)(UInt)floor(dRdCost);
     164#endif
    146165    }
    147166  }
     
    166185}
    167186#endif
    168 
    169187
    170188// Initialize Function Pointer by [eDFunc]
     
    189207  m_afpDistortFunc[14] = TComRdCost::xGetSAD16N;
    190208 
    191   m_afpDistortFunc[15] = TComRdCost::xGetSADs;
    192   m_afpDistortFunc[16] = TComRdCost::xGetSADs4;
    193   m_afpDistortFunc[17] = TComRdCost::xGetSADs8;
    194   m_afpDistortFunc[18] = TComRdCost::xGetSADs16;
    195   m_afpDistortFunc[19] = TComRdCost::xGetSADs32;
    196   m_afpDistortFunc[20] = TComRdCost::xGetSADs64;
    197   m_afpDistortFunc[21] = TComRdCost::xGetSADs16N;
    198  
     209  m_afpDistortFunc[15] = TComRdCost::xGetSAD;
     210  m_afpDistortFunc[16] = TComRdCost::xGetSAD4;
     211  m_afpDistortFunc[17] = TComRdCost::xGetSAD8;
     212  m_afpDistortFunc[18] = TComRdCost::xGetSAD16;
     213  m_afpDistortFunc[19] = TComRdCost::xGetSAD32;
     214  m_afpDistortFunc[20] = TComRdCost::xGetSAD64;
     215  m_afpDistortFunc[21] = TComRdCost::xGetSAD16N;
     216 
     217#if AMP_SAD
     218  m_afpDistortFunc[43] = TComRdCost::xGetSAD12;
     219  m_afpDistortFunc[44] = TComRdCost::xGetSAD24;
     220  m_afpDistortFunc[45] = TComRdCost::xGetSAD48;
     221
     222  m_afpDistortFunc[46] = TComRdCost::xGetSAD12;
     223  m_afpDistortFunc[47] = TComRdCost::xGetSAD24;
     224  m_afpDistortFunc[48] = TComRdCost::xGetSAD48;
     225#endif
    199226  m_afpDistortFunc[22] = TComRdCost::xGetHADs;
    200 #ifdef DCM_RDCOST_TEMP_FIX //Temporary fix since xGetHADs4 and xGetHADs8 assume that the row size cannot be 1, 2, 3 or 6 when the column size is 4 or 8.
    201227  m_afpDistortFunc[23] = TComRdCost::xGetHADs;
    202228  m_afpDistortFunc[24] = TComRdCost::xGetHADs;
    203 #else
    204   m_afpDistortFunc[23] = TComRdCost::xGetHADs4;
    205   m_afpDistortFunc[24] = TComRdCost::xGetHADs8;
    206 #endif
    207229  m_afpDistortFunc[25] = TComRdCost::xGetHADs;
    208230  m_afpDistortFunc[26] = TComRdCost::xGetHADs;
     
    210232  m_afpDistortFunc[28] = TComRdCost::xGetHADs;
    211233 
    212 #ifdef ROUNDING_CONTROL_BIPRED
    213   m_afpDistortFuncRnd[0]  = NULL;
    214   m_afpDistortFuncRnd[1]  = TComRdCost::xGetSSE;
    215   m_afpDistortFuncRnd[2]  = TComRdCost::xGetSSE4;
    216   m_afpDistortFuncRnd[3]  = TComRdCost::xGetSSE8;
    217   m_afpDistortFuncRnd[4]  = TComRdCost::xGetSSE16;
    218   m_afpDistortFuncRnd[5]  = TComRdCost::xGetSSE32;
    219   m_afpDistortFuncRnd[6]  = TComRdCost::xGetSSE64;
    220   m_afpDistortFuncRnd[7]  = TComRdCost::xGetSSE16N;
    221  
    222   m_afpDistortFuncRnd[8]  = TComRdCost::xGetSAD;
    223   m_afpDistortFuncRnd[9]  = TComRdCost::xGetSAD4;
    224   m_afpDistortFuncRnd[10] = TComRdCost::xGetSAD8;
    225   m_afpDistortFuncRnd[11] = TComRdCost::xGetSAD16;
    226   m_afpDistortFuncRnd[12] = TComRdCost::xGetSAD32;
    227   m_afpDistortFuncRnd[13] = TComRdCost::xGetSAD64;
    228   m_afpDistortFuncRnd[14] = TComRdCost::xGetSAD16N;
    229  
    230   m_afpDistortFuncRnd[15] = TComRdCost::xGetSADs;
    231   m_afpDistortFuncRnd[16] = TComRdCost::xGetSADs4;
    232   m_afpDistortFuncRnd[17] = TComRdCost::xGetSADs8;
    233   m_afpDistortFuncRnd[18] = TComRdCost::xGetSADs16;
    234   m_afpDistortFuncRnd[19] = TComRdCost::xGetSADs32;
    235   m_afpDistortFuncRnd[20] = TComRdCost::xGetSADs64;
    236   m_afpDistortFuncRnd[21] = TComRdCost::xGetSADs16N;
    237  
    238   m_afpDistortFuncRnd[22] = TComRdCost::xGetHADs;
    239   m_afpDistortFuncRnd[23] = TComRdCost::xGetHADs4;
    240   m_afpDistortFuncRnd[24] = TComRdCost::xGetHADs8;
    241   m_afpDistortFuncRnd[25] = TComRdCost::xGetHADs;
    242   m_afpDistortFuncRnd[26] = TComRdCost::xGetHADs;
    243   m_afpDistortFuncRnd[27] = TComRdCost::xGetHADs;
    244   m_afpDistortFuncRnd[28] = TComRdCost::xGetHADs;
    245 #endif
    246  
     234#if !FIX203
    247235  m_puiComponentCostOriginP = NULL;
    248236  m_puiComponentCost        = NULL;
    249237  m_puiVerCost              = NULL;
    250238  m_puiHorCost              = NULL;
     239#endif
    251240  m_uiCost                  = 0;
    252241  m_iCostScale              = 0;
     242#if !FIX203
    253243  m_iSearchLimit            = 0xdeaddead;
    254 
    255   m_puiMultiviewRegCostHorOrgP  = 0;
    256   m_puiMultiviewRegCostVerOrgP  = 0;
    257   m_puiMultiviewRegCostHor      = 0;
    258   m_puiMultiviewRegCostVer      = 0;
    259244
    260245#if HHI_VSO
     
    271256  m_dLambdaScale            = 1;
    272257#endif
    273 }
    274 
     258#endif
     259}
     260
     261#if !FIX203
    275262Void TComRdCost::initRateDistortionModel( Int iSubPelSearchLimit )
    276263{
     
    285272    m_iSearchLimit = iSubPelSearchLimit;
    286273   
    287     m_puiComponentCostOriginP     = new UInt[ 4 * iSubPelSearchLimit ];
    288     m_puiMultiviewRegCostHorOrgP  = new UInt[ 4 * iSubPelSearchLimit ];
    289     m_puiMultiviewRegCostVerOrgP  = new UInt[ 4 * iSubPelSearchLimit ];
     274    m_puiComponentCostOriginP = new UInt[ 4 * iSubPelSearchLimit ];
    290275    iSubPelSearchLimit *= 2;
    291276   
    292     m_puiComponentCost       = m_puiComponentCostOriginP    + iSubPelSearchLimit;
    293     m_puiMultiviewRegCostHor = m_puiMultiviewRegCostHorOrgP + iSubPelSearchLimit;
    294     m_puiMultiviewRegCostVer = m_puiMultiviewRegCostVerOrgP + iSubPelSearchLimit;
     277    m_puiComponentCost = m_puiComponentCostOriginP + iSubPelSearchLimit;
    295278   
    296279    for( Int n = -iSubPelSearchLimit; n < iSubPelSearchLimit; n++)
    297280    {
    298       m_puiComponentCost      [n] = xGetComponentBits( n );
    299       m_puiMultiviewRegCostHor[n] = xGetComponentBits( n );  // first version
    300       m_puiMultiviewRegCostVer[n] = xGetComponentBits( n );  // first version
     281      m_puiComponentCost[n] = xGetComponentBits( n );
    301282    }
    302283  }
     
    310291    m_puiComponentCostOriginP = NULL;
    311292  }
    312 
    313   if( m_puiMultiviewRegCostHorOrgP )
    314   {
    315     delete [] m_puiMultiviewRegCostHorOrgP;
    316     m_puiMultiviewRegCostHorOrgP = NULL;
    317   }
    318   if( m_puiMultiviewRegCostVerOrgP )
    319   {
    320     delete [] m_puiMultiviewRegCostVerOrgP;
    321     m_puiMultiviewRegCostVerOrgP = NULL;
    322   }
    323 
    324 #if HHI_VSO
    325   if ( m_apRefPics != NULL )
    326   {
    327     delete[] m_apRefPics;
    328     m_apRefPics = NULL;
    329   }
    330 
    331   if ( m_paaiShiftLUTs != NULL ) { // Delete only first dimension, other dimension are not create in this class
    332     delete[] m_paaiShiftLUTs;
    333     m_paaiShiftLUTs = NULL;
    334   };
    335 #endif
    336 }
     293}
     294#endif
    337295
    338296UInt TComRdCost::xGetComponentBits( Int iVal )
     
    351309  return uiLength;
    352310}
    353 
    354 #ifdef ROUNDING_CONTROL_BIPRED
    355 // Setting the Distortion Parameter for Inter (ME)
    356 Void TComRdCost::setDistParam_Bi( TComPattern* pcPatternKey, Pel* piRefY, Int iRefStride, DistParam& rcDistParam )
    357 {
    358   // set Original & Curr Pointer / Stride
    359   rcDistParam.pOrg = pcPatternKey->getROIY();
    360   rcDistParam.pCur = piRefY;
    361  
    362   rcDistParam.iStrideOrg = pcPatternKey->getPatternLStride();
    363   rcDistParam.iStrideCur = iRefStride;
    364  
    365   // set Block Width / Height
    366   rcDistParam.iCols    = pcPatternKey->getROIYWidth();
    367   rcDistParam.iRows    = pcPatternKey->getROIYHeight();
    368   rcDistParam.DistFuncRnd = m_afpDistortFuncRnd[DF_SAD + g_aucConvertToBit[ rcDistParam.iCols ] + 1 ];
    369  
    370   // initialize
    371   rcDistParam.iSubShift  = 0;
    372 }
    373 
    374 // Setting the Distortion Parameter for Inter (subpel ME with step)
    375 Void TComRdCost::setDistParam_Bi( TComPattern* pcPatternKey, Pel* piRefY, Int iRefStride, Int iStep, DistParam& rcDistParam, Bool bHADME )
    376 {
    377   // set Original & Curr Pointer / Stride
    378   rcDistParam.pOrg = pcPatternKey->getROIY();
    379   rcDistParam.pCur = piRefY;
    380  
    381   rcDistParam.iStrideOrg = pcPatternKey->getPatternLStride();
    382   rcDistParam.iStrideCur = iRefStride * iStep;
    383  
    384   // set Step for interpolated buffer
    385   rcDistParam.iStep = iStep;
    386  
    387   // set Block Width / Height
    388   rcDistParam.iCols    = pcPatternKey->getROIYWidth();
    389   rcDistParam.iRows    = pcPatternKey->getROIYHeight();
    390  
    391   // set distortion function
    392   if ( !bHADME )
    393   {
    394     rcDistParam.DistFuncRnd = m_afpDistortFuncRnd[DF_SADS + g_aucConvertToBit[ rcDistParam.iCols ] + 1 ];
    395   }
    396   else
    397   {
    398     rcDistParam.DistFuncRnd = m_afpDistortFuncRnd[DF_HADS + g_aucConvertToBit[ rcDistParam.iCols ] + 1 ];
    399   }
    400  
    401   // initialize
    402   rcDistParam.iSubShift  = 0;
    403 }
    404 #endif
    405311
    406312Void TComRdCost::setDistParam( UInt uiBlkWidth, UInt uiBlkHeight, DFunc eDFunc, DistParam& rcDistParam )
     
    434340  rcDistParam.DistFunc = m_afpDistortFunc[DF_SAD + g_aucConvertToBit[ rcDistParam.iCols ] + 1 ];
    435341 
     342#if AMP_SAD
     343  if (rcDistParam.iCols == 12)
     344  {
     345    rcDistParam.DistFunc = m_afpDistortFunc[43 ];
     346  }
     347  else if (rcDistParam.iCols == 24)
     348  {
     349    rcDistParam.DistFunc = m_afpDistortFunc[44 ];
     350  }
     351  else if (rcDistParam.iCols == 48)
     352  {
     353    rcDistParam.DistFunc = m_afpDistortFunc[45 ];
     354  }
     355#endif
     356
    436357  // initialize
    437358  rcDistParam.iSubShift  = 0;
     
    443364
    444365// Setting the Distortion Parameter for Inter (subpel ME with step)
     366#if NS_HAD
     367Void TComRdCost::setDistParam( TComPattern* pcPatternKey, Pel* piRefY, Int iRefStride, Int iStep, DistParam& rcDistParam, Bool bHADME, Bool bUseNSHAD )
     368#else
    445369Void TComRdCost::setDistParam( TComPattern* pcPatternKey, Pel* piRefY, Int iRefStride, Int iStep, DistParam& rcDistParam, Bool bHADME )
     370#endif
    446371{
    447372  // set Original & Curr Pointer / Stride
     
    458383  rcDistParam.iCols    = pcPatternKey->getROIYWidth();
    459384  rcDistParam.iRows    = pcPatternKey->getROIYHeight();
     385#if NS_HAD
     386  rcDistParam.bUseNSHAD = bUseNSHAD;
     387#endif
    460388 
    461389  // set distortion function
     
    463391  {
    464392    rcDistParam.DistFunc = m_afpDistortFunc[DF_SADS + g_aucConvertToBit[ rcDistParam.iCols ] + 1 ];
     393#if AMP_SAD
     394    if (rcDistParam.iCols == 12)
     395    {
     396      rcDistParam.DistFunc = m_afpDistortFunc[46 ];
     397    }
     398    else if (rcDistParam.iCols == 24)
     399    {
     400      rcDistParam.DistFunc = m_afpDistortFunc[47 ];
     401    }
     402    else if (rcDistParam.iCols == 48)
     403    {
     404      rcDistParam.DistFunc = m_afpDistortFunc[48 ];
     405    }
     406#endif
    465407  }
    466408  else
     
    478420
    479421Void
     422#if NS_HAD
     423TComRdCost::setDistParam( DistParam& rcDP, Pel* p1, Int iStride1, Pel* p2, Int iStride2, Int iWidth, Int iHeight, Bool bHadamard, Bool bUseNSHAD )
     424#else
    480425TComRdCost::setDistParam( DistParam& rcDP, Pel* p1, Int iStride1, Pel* p2, Int iStride2, Int iWidth, Int iHeight, Bool bHadamard )
     426#endif
    481427{
    482428  rcDP.pOrg       = p1;
     
    493439  rcDP.iStrideUsed = 0;
    494440#endif
     441#if NS_HAD
     442  rcDP.bUseNSHAD  = bUseNSHAD;
     443#endif
    495444}
    496445
     
    561510#endif
    562511
     512#if WEIGHTED_CHROMA_DISTORTION
     513UInt TComRdCost::getDistPart( Pel* piCur, Int iCurStride,  Pel* piOrg, Int iOrgStride, UInt uiBlkWidth, UInt uiBlkHeight, Bool bWeighted, DFunc eDFunc )
     514#else
    563515UInt TComRdCost::getDistPart( Pel* piCur, Int iCurStride,  Pel* piOrg, Int iOrgStride, UInt uiBlkWidth, UInt uiBlkHeight, DFunc eDFunc )
     516#endif
    564517{
    565518  DistParam cDtParam;
     
    569522  cDtParam.iStrideOrg = iOrgStride;
    570523  cDtParam.iStrideCur = iCurStride;
    571 #ifdef DCM_RDCOST_TEMP_FIX //Temporary fix since DistParam is lacking a constructor and the variable iStep is not initialized
    572524  cDtParam.iStep      = 1;
    573 #endif
    574 #ifdef WEIGHT_PRED
    575   cDtParam.applyWeight  = false;
     525
     526  cDtParam.bApplyWeight = false;
    576527  cDtParam.uiComp       = 255;    // just for assert: to be sure it was set before use, since only values 0,1 or 2 are allowed.
    577 #endif
     528
     529#if WEIGHTED_CHROMA_DISTORTION
     530  if (bWeighted)
     531  {
     532    return ((int) (m_chromaDistortionWeight * cDtParam.DistFunc( &cDtParam )));
     533  }
     534  else
     535  {
     536    return cDtParam.DistFunc( &cDtParam );
     537  }
     538#else
    578539  return cDtParam.DistFunc( &cDtParam );
    579 }
     540#endif
     541}
     542
     543
    580544
    581545// ====================================================================================================================
     
    587551// --------------------------------------------------------------------------------------------------------------------
    588552
    589 #ifdef ROUNDING_CONTROL_BIPRED
    590 UInt TComRdCost::xGetSAD( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    591 {
    592   Pel* piOrg   = pcDtParam->pOrg;
    593   Pel* piCur   = pcDtParam->pCur;
    594   Pel* piRef   = pRefY;
    595   Int  iRows   = pcDtParam->iRows;
    596   Int  iCols   = pcDtParam->iCols;
    597   Int  iStrideCur = pcDtParam->iStrideCur;
    598   Int  iStrideOrg = pcDtParam->iStrideOrg;
    599   Pel  pred;
    600  
    601   UInt uiSum = 0;
    602  
    603   for( ; iRows != 0; iRows-- )
    604   {
    605     for (Int n = 0; n < iCols; n++ )
    606     {
    607       pred = (piCur[n] + piRef[n] + bRound) >> 1 ;
    608       uiSum += abs( piOrg[n] - pred );
    609     }
    610     piOrg += iStrideOrg;
    611     piCur += iStrideCur;
    612     piRef += iCols;
    613   }
    614  
    615   return ( uiSum >> g_uiBitIncrement );
    616 }
    617 
    618 UInt TComRdCost::xGetSAD4( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    619 {
    620   Pel* piOrg   = pcDtParam->pOrg;
    621   Pel* piCur   = pcDtParam->pCur;
    622   Pel* piRef   = pRefY;
    623   Int  iRows   = pcDtParam->iRows;
    624   Int  iSubShift  = pcDtParam->iSubShift;
    625   Int  iSubStep   = ( 1 << iSubShift );
    626   Int  iStrideCur = pcDtParam->iStrideCur*iSubStep;
    627   Int  iStrideOrg = pcDtParam->iStrideOrg*iSubStep;
    628   Int  iStrideRef = pcDtParam->iCols*iSubStep;
    629   Pel  pred;
    630  
    631   UInt uiSum = 0;
    632  
    633   for( ; iRows != 0; iRows-=iSubStep )
    634   {
    635     pred = (piCur[0] + piRef[0] + bRound) >> 1 ;
    636     uiSum += abs( piOrg[0] - pred );
    637     pred = (piCur[1] + piRef[1] + bRound) >> 1 ;
    638     uiSum += abs( piOrg[1] - pred );
    639     pred = (piCur[2] + piRef[2] + bRound) >> 1 ;
    640     uiSum += abs( piOrg[2] - pred );
    641     pred = (piCur[3] + piRef[3] + bRound) >> 1 ;
    642     uiSum += abs( piOrg[3] - pred );
    643    
    644     piOrg += iStrideOrg;
    645     piCur += iStrideCur;
    646     piRef += iStrideRef;
    647   }
    648  
    649   uiSum <<= iSubShift;
    650   return ( uiSum >> g_uiBitIncrement );
    651 }
    652 
    653 UInt TComRdCost::xGetSAD8( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    654 {
    655   Pel* piOrg      = pcDtParam->pOrg;
    656   Pel* piCur      = pcDtParam->pCur;
    657   Pel* piRef      = pRefY;
    658   Int  iRows      = pcDtParam->iRows;
    659   Int  iSubShift  = pcDtParam->iSubShift;
    660   Int  iSubStep   = ( 1 << iSubShift );
    661   Int  iStrideCur = pcDtParam->iStrideCur*iSubStep;
    662   Int  iStrideOrg = pcDtParam->iStrideOrg*iSubStep;
    663   Int  iStrideRef = pcDtParam->iCols*iSubStep;
    664   Pel  pred;
    665  
    666   UInt uiSum = 0;
    667  
    668   for( ; iRows != 0; iRows-=iSubStep )
    669   {
    670     pred = (piCur[0] + piRef[0] + bRound) >> 1 ;
    671     uiSum += abs( piOrg[0] - pred );
    672     pred = (piCur[1] + piRef[1] + bRound) >> 1 ;
    673     uiSum += abs( piOrg[1] - pred );
    674     pred = (piCur[2] + piRef[2] + bRound) >> 1 ;
    675     uiSum += abs( piOrg[2] - pred );
    676     pred = (piCur[3] + piRef[3] + bRound) >> 1 ;
    677     uiSum += abs( piOrg[3] - pred );
    678     pred = (piCur[4] + piRef[4] + bRound) >> 1 ;
    679     uiSum += abs( piOrg[4] - pred );
    680     pred = (piCur[5] + piRef[5] + bRound) >> 1 ;
    681     uiSum += abs( piOrg[5] - pred );
    682     pred = (piCur[6] + piRef[6] + bRound) >> 1 ;
    683     uiSum += abs( piOrg[6] - pred );
    684     pred = (piCur[7] + piRef[7] + bRound) >> 1 ;
    685     uiSum += abs( piOrg[7] - pred );
    686    
    687     piOrg += iStrideOrg;
    688     piCur += iStrideCur;
    689     piRef += iStrideRef;
    690   }
    691  
    692   uiSum <<= iSubShift;
    693   return ( uiSum >> g_uiBitIncrement );
    694 }
    695 
    696 UInt TComRdCost::xGetSAD16( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    697 {
    698   Pel* piOrg   = pcDtParam->pOrg;
    699   Pel* piCur   = pcDtParam->pCur;
    700   Pel* piRef   = pRefY;
    701   Int  iRows   = pcDtParam->iRows;
    702   Int  iSubShift  = pcDtParam->iSubShift;
    703   Int  iSubStep   = ( 1 << iSubShift );
    704   Int  iStrideCur = pcDtParam->iStrideCur*iSubStep;
    705   Int  iStrideOrg = pcDtParam->iStrideOrg*iSubStep;
    706   Int  iStrideRef = pcDtParam->iCols*iSubStep;
    707   Pel  pred;
    708  
    709   UInt uiSum = 0;
    710  
    711   for( ; iRows != 0; iRows-=iSubStep )
    712   {
    713     pred = (piCur[0] + piRef[0] + bRound) >> 1 ;
    714     uiSum += abs( piOrg[0] - pred );
    715     pred = (piCur[1] + piRef[1] + bRound) >> 1 ;
    716     uiSum += abs( piOrg[1] - pred );
    717     pred = (piCur[2] + piRef[2] + bRound) >> 1 ;
    718     uiSum += abs( piOrg[2] - pred );
    719     pred = (piCur[3] + piRef[3] + bRound) >> 1 ;
    720     uiSum += abs( piOrg[3] - pred );
    721     pred = (piCur[4] + piRef[4] + bRound) >> 1 ;
    722     uiSum += abs( piOrg[4] - pred );
    723     pred = (piCur[5] + piRef[5] + bRound) >> 1 ;
    724     uiSum += abs( piOrg[5] - pred );
    725     pred = (piCur[6] + piRef[6] + bRound) >> 1 ;
    726     uiSum += abs( piOrg[6] - pred );
    727     pred = (piCur[7] + piRef[7] + bRound) >> 1 ;
    728     uiSum += abs( piOrg[7] - pred );
    729     pred = (piCur[8] + piRef[8] + bRound) >> 1 ;
    730     uiSum += abs( piOrg[8] - pred );
    731     pred = (piCur[9] + piRef[9] + bRound) >> 1 ;
    732     uiSum += abs( piOrg[9] - pred );
    733     pred = (piCur[10] + piRef[10] + bRound) >> 1 ;
    734     uiSum += abs( piOrg[10] - pred );
    735     pred = (piCur[11] + piRef[11] + bRound) >> 1 ;
    736     uiSum += abs( piOrg[11] - pred );
    737     pred = (piCur[12] + piRef[12] + bRound) >> 1 ;
    738     uiSum += abs( piOrg[12] - pred );
    739     pred = (piCur[13] + piRef[13] + bRound) >> 1 ;
    740     uiSum += abs( piOrg[13] - pred );
    741     pred = (piCur[14] + piRef[14] + bRound) >> 1 ;
    742     uiSum += abs( piOrg[14] - pred );
    743     pred = (piCur[15] + piRef[15] + bRound) >> 1 ;
    744     uiSum += abs( piOrg[15] - pred );
    745    
    746     piOrg += iStrideOrg;
    747     piCur += iStrideCur;
    748     piRef += iStrideRef;
    749   }
    750  
    751   uiSum <<= iSubShift;
    752   return ( uiSum >> g_uiBitIncrement );
    753 }
    754 
    755 UInt TComRdCost::xGetSAD16N( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    756 {
    757   Pel* piOrg   = pcDtParam->pOrg;
    758   Pel* piCur   = pcDtParam->pCur;
    759   Pel* piRef   = pRefY;
    760   Int  iRows   = pcDtParam->iRows;
    761   Int  iCols   = pcDtParam->iCols;
    762   Int  iSubShift  = pcDtParam->iSubShift;
    763   Int  iSubStep   = ( 1 << iSubShift );
    764   Int  iStrideCur = pcDtParam->iStrideCur*iSubStep;
    765   Int  iStrideOrg = pcDtParam->iStrideOrg*iSubStep;
    766   Int  iStrideRef = iCols*iSubStep;
    767   Pel  pred;
    768  
    769   UInt uiSum = 0;
    770  
    771   for( ; iRows != 0; iRows-=iSubStep )
    772   {
    773     for (Int n = 0; n < iCols; n+=16 )
    774     {
    775      
    776       pred = (piCur[n+ 0] + piRef[n+ 0] + bRound) >> 1 ;
    777       uiSum += abs( piOrg[n+ 0] - pred );
    778       pred = (piCur[n+ 1] + piRef[n+ 1] + bRound) >> 1 ;
    779       uiSum += abs( piOrg[n+ 1] - pred );
    780       pred = (piCur[n+ 2] + piRef[n+ 2] + bRound) >> 1 ;
    781       uiSum += abs( piOrg[n+ 2] - pred );
    782       pred = (piCur[n+ 3] + piRef[n+ 3] + bRound) >> 1 ;
    783       uiSum += abs( piOrg[n+ 3] - pred );
    784       pred = (piCur[n+ 4] + piRef[n+ 4] + bRound) >> 1 ;
    785       uiSum += abs( piOrg[n+ 4] - pred );
    786       pred = (piCur[n+ 5] + piRef[n+ 5] + bRound) >> 1 ;
    787       uiSum += abs( piOrg[n+ 5] - pred );
    788       pred = (piCur[n+ 6] + piRef[n+ 6] + bRound) >> 1 ;
    789       uiSum += abs( piOrg[n+ 6] - pred );
    790       pred = (piCur[n+ 7] + piRef[n+ 7] + bRound) >> 1 ;
    791       uiSum += abs( piOrg[n+ 7] - pred );
    792       pred = (piCur[n+ 8] + piRef[n+ 8] + bRound) >> 1 ;
    793       uiSum += abs( piOrg[n+ 8] - pred );
    794       pred = (piCur[n+ 9] + piRef[n+ 9] + bRound) >> 1 ;
    795       uiSum += abs( piOrg[n+ 9] - pred );
    796       pred = (piCur[n+ 10] + piRef[n+ 10] + bRound) >> 1 ;
    797       uiSum += abs( piOrg[n+ 10] - pred );
    798       pred = (piCur[n+ 11] + piRef[n+ 11] + bRound) >> 1 ;
    799       uiSum += abs( piOrg[n+ 11] - pred );
    800       pred = (piCur[n+ 12] + piRef[n+ 12] + bRound) >> 1 ;
    801       uiSum += abs( piOrg[n+ 12] - pred );
    802       pred = (piCur[n+ 13] + piRef[n+ 13] + bRound) >> 1 ;
    803       uiSum += abs( piOrg[n+ 13] - pred );
    804       pred = (piCur[n+ 14] + piRef[n+ 14] + bRound) >> 1 ;
    805       uiSum += abs( piOrg[n+ 14] - pred );
    806       pred = (piCur[n+ 15] + piRef[n+ 15] + bRound) >> 1 ;
    807       uiSum += abs( piOrg[n+ 15] - pred );
    808      
    809     }
    810     piOrg += iStrideOrg;
    811     piCur += iStrideCur;
    812     piRef += iStrideRef;
    813   }
    814  
    815   uiSum <<= iSubShift;
    816   return ( uiSum >> g_uiBitIncrement );
    817 }
    818 
    819 UInt TComRdCost::xGetSAD32( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    820 {
    821   Pel* piOrg   = pcDtParam->pOrg;
    822   Pel* piCur   = pcDtParam->pCur;
    823   Pel* piRef   = pRefY;
    824   Int  iRows   = pcDtParam->iRows;
    825   Int  iSubShift  = pcDtParam->iSubShift;
    826   Int  iSubStep   = ( 1 << iSubShift );
    827   Int  iStrideCur = pcDtParam->iStrideCur*iSubStep;
    828   Int  iStrideOrg = pcDtParam->iStrideOrg*iSubStep;
    829   Int  iStrideRef = pcDtParam->iCols*iSubStep;
    830   Pel  pred;
    831  
    832   UInt uiSum = 0;
    833  
    834   for( ; iRows != 0; iRows-=iSubStep )
    835   {
    836     pred = (piCur[0] + piRef[0] + bRound) >> 1 ;
    837     uiSum += abs( piOrg[0] - pred );
    838     pred = (piCur[1] + piRef[1] + bRound) >> 1 ;
    839     uiSum += abs( piOrg[1] - pred );
    840     pred = (piCur[2] + piRef[2] + bRound) >> 1 ;
    841     uiSum += abs( piOrg[2] - pred );
    842     pred = (piCur[3] + piRef[3] + bRound) >> 1 ;
    843     uiSum += abs( piOrg[3] - pred );
    844     pred = (piCur[4] + piRef[4] + bRound) >> 1 ;
    845     uiSum += abs( piOrg[4] - pred );
    846     pred = (piCur[5] + piRef[5] + bRound) >> 1 ;
    847     uiSum += abs( piOrg[5] - pred );
    848     pred = (piCur[6] + piRef[6] + bRound) >> 1 ;
    849     uiSum += abs( piOrg[6] - pred );
    850     pred = (piCur[7] + piRef[7] + bRound) >> 1 ;
    851     uiSum += abs( piOrg[7] - pred );
    852     pred = (piCur[8] + piRef[8] + bRound) >> 1 ;
    853     uiSum += abs( piOrg[8] - pred );
    854     pred = (piCur[9] + piRef[9] + bRound) >> 1 ;
    855     uiSum += abs( piOrg[9] - pred );
    856    
    857     pred = (piCur[10] + piRef[10] + bRound) >> 1 ;
    858     uiSum += abs( piOrg[10] - pred );
    859     pred = (piCur[11] + piRef[11] + bRound) >> 1 ;
    860     uiSum += abs( piOrg[11] - pred );
    861     pred = (piCur[12] + piRef[12] + bRound) >> 1 ;
    862     uiSum += abs( piOrg[12] - pred );
    863     pred = (piCur[13] + piRef[13] + bRound) >> 1 ;
    864     uiSum += abs( piOrg[13] - pred );
    865     pred = (piCur[14] + piRef[14] + bRound) >> 1 ;
    866     uiSum += abs( piOrg[14] - pred );
    867     pred = (piCur[15] + piRef[15] + bRound) >> 1 ;
    868     uiSum += abs( piOrg[15] - pred );
    869     pred = (piCur[16] + piRef[16] + bRound) >> 1 ;
    870     uiSum += abs( piOrg[16] - pred );
    871     pred = (piCur[17] + piRef[17] + bRound) >> 1 ;
    872     uiSum += abs( piOrg[17] - pred );
    873     pred = (piCur[18] + piRef[18] + bRound) >> 1 ;
    874     uiSum += abs( piOrg[18] - pred );
    875     pred = (piCur[19] + piRef[19] + bRound) >> 1 ;
    876     uiSum += abs( piOrg[19] - pred );
    877    
    878     pred = (piCur[20] + piRef[20] + bRound) >> 1 ;
    879     uiSum += abs( piOrg[20] - pred );
    880     pred = (piCur[21] + piRef[21] + bRound) >> 1 ;
    881     uiSum += abs( piOrg[21] - pred );
    882     pred = (piCur[22] + piRef[22] + bRound) >> 1 ;
    883     uiSum += abs( piOrg[22] - pred );
    884     pred = (piCur[23] + piRef[23] + bRound) >> 1 ;
    885     uiSum += abs( piOrg[23] - pred );
    886     pred = (piCur[24] + piRef[24] + bRound) >> 1 ;
    887     uiSum += abs( piOrg[24] - pred );
    888     pred = (piCur[25] + piRef[25] + bRound) >> 1 ;
    889     uiSum += abs( piOrg[25] - pred );
    890     pred = (piCur[26] + piRef[26] + bRound) >> 1 ;
    891     uiSum += abs( piOrg[26] - pred );
    892     pred = (piCur[27] + piRef[27] + bRound) >> 1 ;
    893     uiSum += abs( piOrg[27] - pred );
    894     pred = (piCur[28] + piRef[28] + bRound) >> 1 ;
    895     uiSum += abs( piOrg[28] - pred );
    896     pred = (piCur[29] + piRef[29] + bRound) >> 1 ;
    897     uiSum += abs( piOrg[29] - pred );
    898    
    899     pred = (piCur[30] + piRef[30] + bRound) >> 1 ;
    900     uiSum += abs( piOrg[30] - pred );
    901     pred = (piCur[31] + piRef[31] + bRound) >> 1 ;
    902     uiSum += abs( piOrg[31] - pred );
    903    
    904     piOrg += iStrideOrg;
    905     piCur += iStrideCur;
    906     piRef += iStrideRef;
    907   }
    908  
    909   uiSum <<= iSubShift;
    910   return ( uiSum >> g_uiBitIncrement );
    911 }
    912 
    913 UInt TComRdCost::xGetSAD64( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    914 {
    915   Pel* piOrg   = pcDtParam->pOrg;
    916   Pel* piCur   = pcDtParam->pCur;
    917   Pel* piRef   = pRefY;
    918   Int  iRows   = pcDtParam->iRows;
    919   Int  iSubShift  = pcDtParam->iSubShift;
    920   Int  iSubStep   = ( 1 << iSubShift );
    921   Int  iStrideCur = pcDtParam->iStrideCur*iSubStep;
    922   Int  iStrideOrg = pcDtParam->iStrideOrg*iSubStep;
    923   Int  iStrideRef = pcDtParam->iCols*iSubStep;
    924   Pel  pred;
    925  
    926   UInt uiSum = 0;
    927  
    928   for( ; iRows != 0; iRows-=iSubStep )
    929   {
    930    
    931     pred = (piCur[0] + piRef[0] + bRound) >> 1 ;
    932     uiSum += abs( piOrg[0] - pred );
    933     pred = (piCur[1] + piRef[1] + bRound) >> 1 ;
    934     uiSum += abs( piOrg[1] - pred );
    935     pred = (piCur[2] + piRef[2] + bRound) >> 1 ;
    936     uiSum += abs( piOrg[2] - pred );
    937     pred = (piCur[3] + piRef[3] + bRound) >> 1 ;
    938     uiSum += abs( piOrg[3] - pred );
    939     pred = (piCur[4] + piRef[4] + bRound) >> 1 ;
    940     uiSum += abs( piOrg[4] - pred );
    941     pred = (piCur[5] + piRef[5] + bRound) >> 1 ;
    942     uiSum += abs( piOrg[5] - pred );
    943     pred = (piCur[6] + piRef[6] + bRound) >> 1 ;
    944     uiSum += abs( piOrg[6] - pred );
    945     pred = (piCur[7] + piRef[7] + bRound) >> 1 ;
    946     uiSum += abs( piOrg[7] - pred );
    947     pred = (piCur[8] + piRef[8] + bRound) >> 1 ;
    948     uiSum += abs( piOrg[8] - pred );
    949     pred = (piCur[9] + piRef[9] + bRound) >> 1 ;
    950     uiSum += abs( piOrg[9] - pred );
    951    
    952     pred = (piCur[10] + piRef[10] + bRound) >> 1 ;
    953     uiSum += abs( piOrg[10] - pred );
    954     pred = (piCur[11] + piRef[11] + bRound) >> 1 ;
    955     uiSum += abs( piOrg[11] - pred );
    956     pred = (piCur[12] + piRef[12] + bRound) >> 1 ;
    957     uiSum += abs( piOrg[12] - pred );
    958     pred = (piCur[13] + piRef[13] + bRound) >> 1 ;
    959     uiSum += abs( piOrg[13] - pred );
    960     pred = (piCur[14] + piRef[14] + bRound) >> 1 ;
    961     uiSum += abs( piOrg[14] - pred );
    962     pred = (piCur[15] + piRef[15] + bRound) >> 1 ;
    963     uiSum += abs( piOrg[15] - pred );
    964     pred = (piCur[16] + piRef[16] + bRound) >> 1 ;
    965     uiSum += abs( piOrg[16] - pred );
    966     pred = (piCur[17] + piRef[17] + bRound) >> 1 ;
    967     uiSum += abs( piOrg[17] - pred );
    968     pred = (piCur[18] + piRef[18] + bRound) >> 1 ;
    969     uiSum += abs( piOrg[18] - pred );
    970     pred = (piCur[19] + piRef[19] + bRound) >> 1 ;
    971     uiSum += abs( piOrg[19] - pred );
    972    
    973     pred = (piCur[20] + piRef[20] + bRound) >> 1 ;
    974     uiSum += abs( piOrg[20] - pred );
    975     pred = (piCur[21] + piRef[21] + bRound) >> 1 ;
    976     uiSum += abs( piOrg[21] - pred );
    977     pred = (piCur[22] + piRef[22] + bRound) >> 1 ;
    978     uiSum += abs( piOrg[22] - pred );
    979     pred = (piCur[23] + piRef[23] + bRound) >> 1 ;
    980     uiSum += abs( piOrg[23] - pred );
    981     pred = (piCur[24] + piRef[24] + bRound) >> 1 ;
    982     uiSum += abs( piOrg[24] - pred );
    983     pred = (piCur[25] + piRef[25] + bRound) >> 1 ;
    984     uiSum += abs( piOrg[25] - pred );
    985     pred = (piCur[26] + piRef[26] + bRound) >> 1 ;
    986     uiSum += abs( piOrg[26] - pred );
    987     pred = (piCur[27] + piRef[27] + bRound) >> 1 ;
    988     uiSum += abs( piOrg[27] - pred );
    989     pred = (piCur[28] + piRef[28] + bRound) >> 1 ;
    990     uiSum += abs( piOrg[28] - pred );
    991     pred = (piCur[29] + piRef[29] + bRound) >> 1 ;
    992     uiSum += abs( piOrg[29] - pred );
    993    
    994     pred = (piCur[30] + piRef[30] + bRound) >> 1 ;
    995     uiSum += abs( piOrg[30] - pred );
    996     pred = (piCur[31] + piRef[31] + bRound) >> 1 ;
    997     uiSum += abs( piOrg[31] - pred );
    998     pred = (piCur[32] + piRef[32] + bRound) >> 1 ;
    999     uiSum += abs( piOrg[32] - pred );
    1000     pred = (piCur[33] + piRef[33] + bRound) >> 1 ;
    1001     uiSum += abs( piOrg[33] - pred );
    1002     pred = (piCur[34] + piRef[34] + bRound) >> 1 ;
    1003     uiSum += abs( piOrg[34] - pred );
    1004     pred = (piCur[35] + piRef[35] + bRound) >> 1 ;
    1005     uiSum += abs( piOrg[35] - pred );
    1006     pred = (piCur[36] + piRef[36] + bRound) >> 1 ;
    1007     uiSum += abs( piOrg[36] - pred );
    1008     pred = (piCur[37] + piRef[37] + bRound) >> 1 ;
    1009     uiSum += abs( piOrg[37] - pred );
    1010     pred = (piCur[38] + piRef[38] + bRound) >> 1 ;
    1011     uiSum += abs( piOrg[38] - pred );
    1012     pred = (piCur[39] + piRef[39] + bRound) >> 1 ;
    1013     uiSum += abs( piOrg[39] - pred );
    1014    
    1015     pred = (piCur[40] + piRef[40] + bRound) >> 1 ;
    1016     uiSum += abs( piOrg[40] - pred );
    1017     pred = (piCur[41] + piRef[41] + bRound) >> 1 ;
    1018     uiSum += abs( piOrg[41] - pred );
    1019     pred = (piCur[42] + piRef[42] + bRound) >> 1 ;
    1020     uiSum += abs( piOrg[42] - pred );
    1021     pred = (piCur[43] + piRef[43] + bRound) >> 1 ;
    1022     uiSum += abs( piOrg[43] - pred );
    1023     pred = (piCur[44] + piRef[44] + bRound) >> 1 ;
    1024     uiSum += abs( piOrg[44] - pred );
    1025     pred = (piCur[45] + piRef[45] + bRound) >> 1 ;
    1026     uiSum += abs( piOrg[45] - pred );
    1027     pred = (piCur[46] + piRef[46] + bRound) >> 1 ;
    1028     uiSum += abs( piOrg[46] - pred );
    1029     pred = (piCur[47] + piRef[47] + bRound) >> 1 ;
    1030     uiSum += abs( piOrg[47] - pred );
    1031     pred = (piCur[48] + piRef[48] + bRound) >> 1 ;
    1032     uiSum += abs( piOrg[48] - pred );
    1033     pred = (piCur[49] + piRef[49] + bRound) >> 1 ;
    1034     uiSum += abs( piOrg[49] - pred );
    1035    
    1036     pred = (piCur[50] + piRef[50] + bRound) >> 1 ;
    1037     uiSum += abs( piOrg[50] - pred );
    1038     pred = (piCur[51] + piRef[51] + bRound) >> 1 ;
    1039     uiSum += abs( piOrg[51] - pred );
    1040     pred = (piCur[52] + piRef[52] + bRound) >> 1 ;
    1041     uiSum += abs( piOrg[52] - pred );
    1042     pred = (piCur[53] + piRef[53] + bRound) >> 1 ;
    1043     uiSum += abs( piOrg[53] - pred );
    1044     pred = (piCur[54] + piRef[54] + bRound) >> 1 ;
    1045     uiSum += abs( piOrg[54] - pred );
    1046     pred = (piCur[55] + piRef[55] + bRound) >> 1 ;
    1047     uiSum += abs( piOrg[55] - pred );
    1048     pred = (piCur[56] + piRef[56] + bRound) >> 1 ;
    1049     uiSum += abs( piOrg[56] - pred );
    1050     pred = (piCur[57] + piRef[57] + bRound) >> 1 ;
    1051     uiSum += abs( piOrg[57] - pred );
    1052     pred = (piCur[58] + piRef[58] + bRound) >> 1 ;
    1053     uiSum += abs( piOrg[58] - pred );
    1054     pred = (piCur[59] + piRef[59] + bRound) >> 1 ;
    1055     uiSum += abs( piOrg[59] - pred );
    1056    
    1057     pred = (piCur[60] + piRef[60] + bRound) >> 1 ;
    1058     uiSum += abs( piOrg[60] - pred );
    1059     pred = (piCur[61] + piRef[61] + bRound) >> 1 ;
    1060     uiSum += abs( piOrg[61] - pred );
    1061     pred = (piCur[62] + piRef[62] + bRound) >> 1 ;
    1062     uiSum += abs( piOrg[62] - pred );
    1063     pred = (piCur[63] + piRef[63] + bRound) >> 1 ;
    1064     uiSum += abs( piOrg[63] - pred );
    1065    
    1066     piOrg += iStrideOrg;
    1067     piCur += iStrideCur;
    1068     piRef += iStrideRef;
    1069   }
    1070  
    1071   uiSum <<= iSubShift;
    1072   return ( uiSum >> g_uiBitIncrement );
    1073 }
    1074 #endif
    1075 
    1076553UInt TComRdCost::xGetSAD( DistParam* pcDtParam )
    1077554{
    1078 #ifdef WEIGHT_PRED
    1079   if ( pcDtParam->applyWeight )
    1080   {
    1081     assert(pcDtParam->iSubShift==0);
     555  if ( pcDtParam->bApplyWeight )
     556  {
    1082557    return xGetSADw( pcDtParam );
    1083558  }
    1084 #endif
    1085559  Pel* piOrg   = pcDtParam->pOrg;
    1086560  Pel* piCur   = pcDtParam->pCur;
     
    1107581UInt TComRdCost::xGetSAD4( DistParam* pcDtParam )
    1108582{
    1109 #ifdef WEIGHT_PRED
    1110   if ( pcDtParam->applyWeight )
     583  if ( pcDtParam->bApplyWeight )
    1111584  {
    1112585    return xGetSADw( pcDtParam );
    1113586  }
    1114 #endif
    1115587  Pel* piOrg   = pcDtParam->pOrg;
    1116588  Pel* piCur   = pcDtParam->pCur;
     
    1140612UInt TComRdCost::xGetSAD8( DistParam* pcDtParam )
    1141613{
    1142 #ifdef WEIGHT_PRED
    1143   if ( pcDtParam->applyWeight )
     614  if ( pcDtParam->bApplyWeight )
    1144615  {
    1145616    return xGetSADw( pcDtParam );
    1146617  }
    1147 #endif
    1148618  Pel* piOrg      = pcDtParam->pOrg;
    1149619  Pel* piCur      = pcDtParam->pCur;
     
    1177647UInt TComRdCost::xGetSAD16( DistParam* pcDtParam )
    1178648{
    1179 #ifdef WEIGHT_PRED
    1180   if ( pcDtParam->applyWeight )
     649  if ( pcDtParam->bApplyWeight )
    1181650  {
    1182651    return xGetSADw( pcDtParam );
    1183652  }
    1184 #endif
    1185653  Pel* piOrg   = pcDtParam->pOrg;
    1186654  Pel* piCur   = pcDtParam->pCur;
     
    1220688}
    1221689
     690#if AMP_SAD
     691UInt TComRdCost::xGetSAD12( DistParam* pcDtParam )
     692{
     693  if ( pcDtParam->bApplyWeight )
     694  {
     695    return xGetSADw( pcDtParam );
     696  }
     697  Pel* piOrg   = pcDtParam->pOrg;
     698  Pel* piCur   = pcDtParam->pCur;
     699  Int  iRows   = pcDtParam->iRows;
     700  Int  iSubShift  = pcDtParam->iSubShift;
     701  Int  iSubStep   = ( 1 << iSubShift );
     702  Int  iStrideCur = pcDtParam->iStrideCur*iSubStep;
     703  Int  iStrideOrg = pcDtParam->iStrideOrg*iSubStep;
     704 
     705  UInt uiSum = 0;
     706 
     707  for( ; iRows != 0; iRows-=iSubStep )
     708  {
     709    uiSum += abs( piOrg[0] - piCur[0] );
     710    uiSum += abs( piOrg[1] - piCur[1] );
     711    uiSum += abs( piOrg[2] - piCur[2] );
     712    uiSum += abs( piOrg[3] - piCur[3] );
     713    uiSum += abs( piOrg[4] - piCur[4] );
     714    uiSum += abs( piOrg[5] - piCur[5] );
     715    uiSum += abs( piOrg[6] - piCur[6] );
     716    uiSum += abs( piOrg[7] - piCur[7] );
     717    uiSum += abs( piOrg[8] - piCur[8] );
     718    uiSum += abs( piOrg[9] - piCur[9] );
     719    uiSum += abs( piOrg[10] - piCur[10] );
     720    uiSum += abs( piOrg[11] - piCur[11] );
     721   
     722    piOrg += iStrideOrg;
     723    piCur += iStrideCur;
     724  }
     725 
     726  uiSum <<= iSubShift;
     727  return ( uiSum >> g_uiBitIncrement );
     728}
     729#endif
     730
    1222731UInt TComRdCost::xGetSAD16N( DistParam* pcDtParam )
    1223732{
    1224 #ifdef WEIGHT_PRED
    1225   if ( pcDtParam->applyWeight )
    1226   {
    1227     return xGetSAD16Nw( pcDtParam );
    1228   }
    1229 #endif
    1230733  Pel* piOrg   = pcDtParam->pOrg;
    1231734  Pel* piCur   = pcDtParam->pCur;
     
    1270773UInt TComRdCost::xGetSAD32( DistParam* pcDtParam )
    1271774{
    1272 #ifdef WEIGHT_PRED
    1273   if ( pcDtParam->applyWeight )
     775  if ( pcDtParam->bApplyWeight )
    1274776  {
    1275777    return xGetSADw( pcDtParam );
    1276778  }
    1277 #endif
    1278779  Pel* piOrg   = pcDtParam->pOrg;
    1279780  Pel* piCur   = pcDtParam->pCur;
     
    1329830}
    1330831
     832#if AMP_SAD
     833UInt TComRdCost::xGetSAD24( DistParam* pcDtParam )
     834{
     835  if ( pcDtParam->bApplyWeight )
     836  {
     837    return xGetSADw( pcDtParam );
     838  }
     839  Pel* piOrg   = pcDtParam->pOrg;
     840  Pel* piCur   = pcDtParam->pCur;
     841  Int  iRows   = pcDtParam->iRows;
     842  Int  iSubShift  = pcDtParam->iSubShift;
     843  Int  iSubStep   = ( 1 << iSubShift );
     844  Int  iStrideCur = pcDtParam->iStrideCur*iSubStep;
     845  Int  iStrideOrg = pcDtParam->iStrideOrg*iSubStep;
     846 
     847  UInt uiSum = 0;
     848 
     849  for( ; iRows != 0; iRows-=iSubStep )
     850  {
     851    uiSum += abs( piOrg[0] - piCur[0] );
     852    uiSum += abs( piOrg[1] - piCur[1] );
     853    uiSum += abs( piOrg[2] - piCur[2] );
     854    uiSum += abs( piOrg[3] - piCur[3] );
     855    uiSum += abs( piOrg[4] - piCur[4] );
     856    uiSum += abs( piOrg[5] - piCur[5] );
     857    uiSum += abs( piOrg[6] - piCur[6] );
     858    uiSum += abs( piOrg[7] - piCur[7] );
     859    uiSum += abs( piOrg[8] - piCur[8] );
     860    uiSum += abs( piOrg[9] - piCur[9] );
     861    uiSum += abs( piOrg[10] - piCur[10] );
     862    uiSum += abs( piOrg[11] - piCur[11] );
     863    uiSum += abs( piOrg[12] - piCur[12] );
     864    uiSum += abs( piOrg[13] - piCur[13] );
     865    uiSum += abs( piOrg[14] - piCur[14] );
     866    uiSum += abs( piOrg[15] - piCur[15] );
     867    uiSum += abs( piOrg[16] - piCur[16] );
     868    uiSum += abs( piOrg[17] - piCur[17] );
     869    uiSum += abs( piOrg[18] - piCur[18] );
     870    uiSum += abs( piOrg[19] - piCur[19] );
     871    uiSum += abs( piOrg[20] - piCur[20] );
     872    uiSum += abs( piOrg[21] - piCur[21] );
     873    uiSum += abs( piOrg[22] - piCur[22] );
     874    uiSum += abs( piOrg[23] - piCur[23] );
     875   
     876    piOrg += iStrideOrg;
     877    piCur += iStrideCur;
     878  }
     879 
     880  uiSum <<= iSubShift;
     881  return ( uiSum >> g_uiBitIncrement );
     882}
     883
     884#endif
     885
    1331886UInt TComRdCost::xGetSAD64( DistParam* pcDtParam )
    1332887{
    1333 #ifdef WEIGHT_PRED
    1334   if ( pcDtParam->applyWeight )
     888  if ( pcDtParam->bApplyWeight )
    1335889  {
    1336890    return xGetSADw( pcDtParam );
    1337891  }
    1338 #endif
    1339892  Pel* piOrg   = pcDtParam->pOrg;
    1340893  Pel* piCur   = pcDtParam->pCur;
     
    1422975}
    1423976
    1424 // --------------------------------------------------------------------------------------------------------------------
    1425 // SAD with step (used in fractional search)
    1426 // --------------------------------------------------------------------------------------------------------------------
    1427 
    1428 #ifdef ROUNDING_CONTROL_BIPRED
    // NOTE(review): the lines of this span interleave three definitions. Judging by the
    // leading numbers, lines carrying only a 14xx number presumably belong to the older
    // revision, lines carrying only a 9xx/10xx number to the newer one, and lines carrying
    // both numbers are common to the two - confirm against the changeset legend.
    //
    //  * xGetSADs  (14xx lines): for each of iRows rows and each column n < iCols forms
    //    pred = (piCur[n*iStep] + piRef[n] + bRound) >> 1 and accumulates abs(piOrg[n] - pred);
    //    piRef advances by iCols per row; the sum is returned shifted right by g_uiBitIncrement.
    //  * xGetSADs4 (14xx lines): the same computation written out for exactly four columns,
    //    using iStep, iStep2 (= iStep << 1) and iStep3 (= iStep2 + iStep) as piCur offsets.
    //  * xGetSAD48 (9xx/10xx lines): returns xGetSADw( pcDtParam ) when bApplyWeight is set;
    //    otherwise accumulates abs(piOrg[i] - piCur[i]) for i = 0..47 on every
    //    (1 << iSubShift)-th row, shifts the sum left by iSubShift and then right by
    //    g_uiBitIncrement.
    1429 UInt TComRdCost::xGetSADs( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    1430 {
     977#if AMP_SAD
     978UInt TComRdCost::xGetSAD48( DistParam* pcDtParam )
     979{
     980  if ( pcDtParam->bApplyWeight )
     981  {
     982    return xGetSADw( pcDtParam );
     983  }
    1431984  Pel* piOrg   = pcDtParam->pOrg;
    1432985  Pel* piCur   = pcDtParam->pCur;
    1433   Pel* piRef   = pRefY;
    1434986  Int  iRows   = pcDtParam->iRows;
    1435   Int  iCols   = pcDtParam->iCols;
    1436   Int  iStrideCur = pcDtParam->iStrideCur;
    1437   Int  iStrideOrg = pcDtParam->iStrideOrg;
    1438   Int  iStep  = pcDtParam->iStep;
    1439   Pel  pred;
     987  Int  iSubShift  = pcDtParam->iSubShift;
     988  Int  iSubStep   = ( 1 << iSubShift );
     989  Int  iStrideCur = pcDtParam->iStrideCur*iSubStep;
     990  Int  iStrideOrg = pcDtParam->iStrideOrg*iSubStep;
    1440991 
    1441992  UInt uiSum = 0;
    1442993 
    1443   for( ; iRows != 0; iRows-- )
    1444   {
    1445     for (Int n = 0; n < iCols; n++ )
    1446     {
    1447       pred = (piCur[n*iStep] + piRef[n] + bRound) >> 1 ;
    1448       uiSum += abs( piOrg[n] - pred );
    1449     }
    1450     piOrg += iStrideOrg;
    1451     piCur += iStrideCur;
    1452     piRef += iCols;
    1453   }
    1454  
    1455   return ( uiSum >> g_uiBitIncrement );
    1456 }
    1457 
    1458 UInt TComRdCost::xGetSADs4( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    1459 {
    1460   Pel* piOrg   = pcDtParam->pOrg;
    1461   Pel* piCur   = pcDtParam->pCur;
    1462   Pel* piRef   = pRefY;
    1463   Int  iRows   = pcDtParam->iRows;
    1464   Int  iStrideCur = pcDtParam->iStrideCur;
    1465   Int  iStrideOrg = pcDtParam->iStrideOrg;
    1466   Int  iStrideRef = pcDtParam->iCols;
    1467   Int  iStep  = pcDtParam->iStep;
    1468   Int  iStep2 = iStep<<1;
    1469   Int  iStep3 = iStep2 + iStep;
    1470   Pel  pred;
    1471  
    1472   UInt uiSum = 0;
    1473  
    1474   for( ; iRows != 0; iRows-- )
    1475   {
    1476    
    1477     pred = (piCur[0] + piRef[0] + bRound) >> 1 ;           uiSum += abs( piOrg[0] - pred );
    1478     pred = (piCur[iStep ] + piRef[1] + bRound) >> 1 ;      uiSum += abs( piOrg[1] - pred );
    1479     pred = (piCur[iStep2] + piRef[2] + bRound) >> 1 ;      uiSum += abs( piOrg[2] - pred );
    1480     pred = (piCur[iStep3] + piRef[3] + bRound) >> 1 ;      uiSum += abs( piOrg[3] - pred );
     994  for( ; iRows != 0; iRows-=iSubStep )
     995  {
     996    uiSum += abs( piOrg[0] - piCur[0] );
     997    uiSum += abs( piOrg[1] - piCur[1] );
     998    uiSum += abs( piOrg[2] - piCur[2] );
     999    uiSum += abs( piOrg[3] - piCur[3] );
     1000    uiSum += abs( piOrg[4] - piCur[4] );
     1001    uiSum += abs( piOrg[5] - piCur[5] );
     1002    uiSum += abs( piOrg[6] - piCur[6] );
     1003    uiSum += abs( piOrg[7] - piCur[7] );
     1004    uiSum += abs( piOrg[8] - piCur[8] );
     1005    uiSum += abs( piOrg[9] - piCur[9] );
     1006    uiSum += abs( piOrg[10] - piCur[10] );
     1007    uiSum += abs( piOrg[11] - piCur[11] );
     1008    uiSum += abs( piOrg[12] - piCur[12] );
     1009    uiSum += abs( piOrg[13] - piCur[13] );
     1010    uiSum += abs( piOrg[14] - piCur[14] );
     1011    uiSum += abs( piOrg[15] - piCur[15] );
     1012    uiSum += abs( piOrg[16] - piCur[16] );
     1013    uiSum += abs( piOrg[17] - piCur[17] );
     1014    uiSum += abs( piOrg[18] - piCur[18] );
     1015    uiSum += abs( piOrg[19] - piCur[19] );
     1016    uiSum += abs( piOrg[20] - piCur[20] );
     1017    uiSum += abs( piOrg[21] - piCur[21] );
     1018    uiSum += abs( piOrg[22] - piCur[22] );
     1019    uiSum += abs( piOrg[23] - piCur[23] );
     1020    uiSum += abs( piOrg[24] - piCur[24] );
     1021    uiSum += abs( piOrg[25] - piCur[25] );
     1022    uiSum += abs( piOrg[26] - piCur[26] );
     1023    uiSum += abs( piOrg[27] - piCur[27] );
     1024    uiSum += abs( piOrg[28] - piCur[28] );
     1025    uiSum += abs( piOrg[29] - piCur[29] );
     1026    uiSum += abs( piOrg[30] - piCur[30] );
     1027    uiSum += abs( piOrg[31] - piCur[31] );
     1028    uiSum += abs( piOrg[32] - piCur[32] );
     1029    uiSum += abs( piOrg[33] - piCur[33] );
     1030    uiSum += abs( piOrg[34] - piCur[34] );
     1031    uiSum += abs( piOrg[35] - piCur[35] );
     1032    uiSum += abs( piOrg[36] - piCur[36] );
     1033    uiSum += abs( piOrg[37] - piCur[37] );
     1034    uiSum += abs( piOrg[38] - piCur[38] );
     1035    uiSum += abs( piOrg[39] - piCur[39] );
     1036    uiSum += abs( piOrg[40] - piCur[40] );
     1037    uiSum += abs( piOrg[41] - piCur[41] );
     1038    uiSum += abs( piOrg[42] - piCur[42] );
     1039    uiSum += abs( piOrg[43] - piCur[43] );
     1040    uiSum += abs( piOrg[44] - piCur[44] );
     1041    uiSum += abs( piOrg[45] - piCur[45] );
     1042    uiSum += abs( piOrg[46] - piCur[46] );
     1043    uiSum += abs( piOrg[47] - piCur[47] );
    14811044   
    14821045    piOrg += iStrideOrg;
    14831046    piCur += iStrideCur;
    1484     piRef += iStrideRef;
    1485   }
    1486  
     1047  }
     1048 
     1049  uiSum <<= iSubShift;
    14871050  return ( uiSum >> g_uiBitIncrement );
    14881051}
    1489 
    1490 UInt TComRdCost::xGetSADs8( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    1491 {
    1492   Pel* piOrg   = pcDtParam->pOrg;
    1493   Pel* piCur   = pcDtParam->pCur;
    1494   Pel* piRef   = pRefY;
    1495   Int  iRows   = pcDtParam->iRows;
    1496   Int  iStrideCur = pcDtParam->iStrideCur;
    1497   Int  iStrideOrg = pcDtParam->iStrideOrg;
    1498   Int  iStrideRef = pcDtParam->iCols;
    1499   Int  iStep  = pcDtParam->iStep;
    1500   Int  iStep2 = iStep<<1;
    1501   Int  iStep3 = iStep2 + iStep;
    1502   Int  iStep4 = iStep3 + iStep;
    1503   Int  iStep5 = iStep4 + iStep;
    1504   Int  iStep6 = iStep5 + iStep;
    1505   Int  iStep7 = iStep6 + iStep;
    1506   Pel  pred;
    1507  
    1508   UInt uiSum = 0;
    1509  
    1510   for( ; iRows != 0; iRows-- )
    1511   {
    1512    
    1513     pred = (piCur[0] + piRef[0] + bRound) >> 1 ;           uiSum += abs( piOrg[0] - pred );
    1514     pred = (piCur[iStep ] + piRef[1] + bRound) >> 1 ;      uiSum += abs( piOrg[1] - pred );
    1515     pred = (piCur[iStep2] + piRef[2] + bRound) >> 1 ;      uiSum += abs( piOrg[2] - pred );
    1516     pred = (piCur[iStep3] + piRef[3] + bRound) >> 1 ;      uiSum += abs( piOrg[3] - pred );
    1517     pred = (piCur[iStep4] + piRef[4] + bRound) >> 1 ;      uiSum += abs( piOrg[4] - pred );
    1518     pred = (piCur[iStep5] + piRef[5] + bRound) >> 1 ;      uiSum += abs( piOrg[5] - pred );
    1519     pred = (piCur[iStep6] + piRef[6] + bRound) >> 1 ;      uiSum += abs( piOrg[6] - pred );
    1520     pred = (piCur[iStep7] + piRef[7] + bRound) >> 1 ;      uiSum += abs( piOrg[7] - pred );
    1521    
    1522     piOrg += iStrideOrg;
    1523     piCur += iStrideCur;
    1524     piRef += iStrideRef;
    1525   }
    1526  
    1527   return ( uiSum >> g_uiBitIncrement );
    1528 }
    1529 
    1530 UInt TComRdCost::xGetSADs16( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    1531 {
    1532   Pel* piOrg   = pcDtParam->pOrg;
    1533   Pel* piCur   = pcDtParam->pCur;
    1534   Pel* piRef   = pRefY;
    1535   Int  iRows   = pcDtParam->iRows;
    1536   Int  iStrideCur = pcDtParam->iStrideCur;
    1537   Int  iStrideOrg = pcDtParam->iStrideOrg;
    1538   Int  iStrideRef = pcDtParam->iCols;
    1539   Int  iStep   = pcDtParam->iStep;
    1540   Int  iStep2  = iStep<<1;
    1541   Int  iStep3  = iStep2  + iStep;
    1542   Int  iStep4  = iStep3  + iStep;
    1543   Int  iStep5  = iStep4  + iStep;
    1544   Int  iStep6  = iStep5  + iStep;
    1545   Int  iStep7  = iStep6  + iStep;
    1546   Int  iStep8  = iStep7  + iStep;
    1547   Int  iStep9  = iStep8  + iStep;
    1548   Int  iStep10 = iStep9  + iStep;
    1549   Int  iStep11 = iStep10 + iStep;
    1550   Int  iStep12 = iStep11 + iStep;
    1551   Int  iStep13 = iStep12 + iStep;
    1552   Int  iStep14 = iStep13 + iStep;
    1553   Int  iStep15 = iStep14 + iStep;
    1554   Pel  pred;
    1555  
    1556   UInt uiSum = 0;
    1557  
    1558   for( ; iRows != 0; iRows-- )
    1559   {
    1560     pred = (piCur[0] + piRef[0] + bRound) >> 1 ;           uiSum += abs( piOrg[0] - pred );
    1561     pred = (piCur[iStep ] + piRef[1] + bRound) >> 1 ;      uiSum += abs( piOrg[1] - pred );
    1562     pred = (piCur[iStep2] + piRef[2] + bRound) >> 1 ;      uiSum += abs( piOrg[2] - pred );
    1563     pred = (piCur[iStep3] + piRef[3] + bRound) >> 1 ;      uiSum += abs( piOrg[3] - pred );
    1564     pred = (piCur[iStep4] + piRef[4] + bRound) >> 1 ;      uiSum += abs( piOrg[4] - pred );
    1565     pred = (piCur[iStep5] + piRef[5] + bRound) >> 1 ;      uiSum += abs( piOrg[5] - pred );
    1566     pred = (piCur[iStep6] + piRef[6] + bRound) >> 1 ;      uiSum += abs( piOrg[6] - pred );
    1567     pred = (piCur[iStep7] + piRef[7] + bRound) >> 1 ;      uiSum += abs( piOrg[7] - pred );
    1568     pred = (piCur[iStep8] + piRef[8] + bRound) >> 1 ;      uiSum += abs( piOrg[8] - pred );
    1569     pred = (piCur[iStep9] + piRef[9] + bRound) >> 1 ;      uiSum += abs( piOrg[9] - pred );
    1570     pred = (piCur[iStep10] + piRef[10] + bRound) >> 1 ;    uiSum += abs( piOrg[10] - pred );
    1571     pred = (piCur[iStep11] + piRef[11] + bRound) >> 1 ;    uiSum += abs( piOrg[11] - pred );
    1572     pred = (piCur[iStep12] + piRef[12] + bRound) >> 1 ;    uiSum += abs( piOrg[12] - pred );
    1573     pred = (piCur[iStep13] + piRef[13] + bRound) >> 1 ;    uiSum += abs( piOrg[13] - pred );
    1574     pred = (piCur[iStep14] + piRef[14] + bRound) >> 1 ;    uiSum += abs( piOrg[14] - pred );
    1575     pred = (piCur[iStep15] + piRef[15] + bRound) >> 1 ;    uiSum += abs( piOrg[15] - pred );
    1576    
    1577     piOrg += iStrideOrg;
    1578     piCur += iStrideCur;
    1579     piRef += iStrideRef;
    1580   }
    1581  
    1582   return ( uiSum >> g_uiBitIncrement );
    1583 }
    1584 
    1585 UInt TComRdCost::xGetSADs16N( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    1586 {
    1587   Pel* piOrg   = pcDtParam->pOrg;
    1588   Pel* piCur   = pcDtParam->pCur;
    1589   Pel* piRef   = pRefY;
    1590   Int  iRows   = pcDtParam->iRows;
    1591   Int  iCols   = pcDtParam->iCols;
    1592   Int  iStrideCur = pcDtParam->iStrideCur;
    1593   Int  iStrideOrg = pcDtParam->iStrideOrg;
    1594   Int  iStrideRef = pcDtParam->iCols;
    1595   Int  iStep  = pcDtParam->iStep;
    1596   Pel  pred;
    1597  
    1598   UInt uiSum = 0;
    1599  
    1600   for( ; iRows != 0; iRows-- )
    1601   {
    1602     for (Int n = 0; n < iCols; n+=16 )
    1603     {
    1604       pred = (piCur[iStep*(n +0)] + piRef[n + 0] + bRound) >> 1 ;       uiSum += abs( piOrg[n +0] -  pred );
    1605       pred = (piCur[iStep*(n +1)] + piRef[n + 1] + bRound) >> 1 ;       uiSum += abs( piOrg[n +1] -  pred );
    1606       pred = (piCur[iStep*(n +2)] + piRef[n + 2] + bRound) >> 1 ;       uiSum += abs( piOrg[n +2] -  pred );
    1607       pred = (piCur[iStep*(n +3)] + piRef[n + 3] + bRound) >> 1 ;       uiSum += abs( piOrg[n +3] -  pred );
    1608       pred = (piCur[iStep*(n +4)] + piRef[n + 4] + bRound) >> 1 ;       uiSum += abs( piOrg[n +4] -  pred );
    1609       pred = (piCur[iStep*(n +5)] + piRef[n + 5] + bRound) >> 1 ;       uiSum += abs( piOrg[n +5] -  pred );
    1610       pred = (piCur[iStep*(n +6)] + piRef[n + 6] + bRound) >> 1 ;       uiSum += abs( piOrg[n +6] -  pred );
    1611       pred = (piCur[iStep*(n +7)] + piRef[n + 7] + bRound) >> 1 ;       uiSum += abs( piOrg[n +7] -  pred );
    1612       pred = (piCur[iStep*(n +8)] + piRef[n + 8] + bRound) >> 1 ;       uiSum += abs( piOrg[n +8] -  pred );
    1613       pred = (piCur[iStep*(n +9)] + piRef[n + 9] + bRound) >> 1 ;       uiSum += abs( piOrg[n +9] -  pred );
    1614       pred = (piCur[iStep*(n +10)] + piRef[n + 10] + bRound) >> 1 ;     uiSum += abs( piOrg[n +10] -  pred );
    1615       pred = (piCur[iStep*(n +11)] + piRef[n + 11] + bRound) >> 1 ;     uiSum += abs( piOrg[n +11] -  pred );
    1616       pred = (piCur[iStep*(n +12)] + piRef[n + 12] + bRound) >> 1 ;     uiSum += abs( piOrg[n +12] -  pred );
    1617       pred = (piCur[iStep*(n +13)] + piRef[n + 13] + bRound) >> 1 ;     uiSum += abs( piOrg[n +13] -  pred );
    1618       pred = (piCur[iStep*(n +14)] + piRef[n + 14] + bRound) >> 1 ;     uiSum += abs( piOrg[n +14] -  pred );
    1619       pred = (piCur[iStep*(n +15)] + piRef[n + 15] + bRound) >> 1 ;     uiSum += abs( piOrg[n +15] -  pred );
    1620     }
    1621     piOrg += iStrideOrg;
    1622     piCur += iStrideCur;
    1623     piRef += iStrideRef;
    1624   }
    1625  
    1626   return ( uiSum >> g_uiBitIncrement );
    1627 }
    1628 
    1629 UInt TComRdCost::xGetSADs32( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    1630 {
    1631   Pel* piOrg   = pcDtParam->pOrg;
    1632   Pel* piCur   = pcDtParam->pCur;
    1633   Pel* piRef   = pRefY;
    1634   Int  iRows   = pcDtParam->iRows;
    1635   Int  iStrideCur = pcDtParam->iStrideCur;
    1636   Int  iStrideOrg = pcDtParam->iStrideOrg;
    1637   Int  iStrideRef = pcDtParam->iCols;
    1638   Int  iStep  = pcDtParam->iStep;
    1639   Int  iStep2  = iStep<<1;
    1640   Int  iStep3  = iStep2  + iStep;
    1641   Int  iStep4  = iStep3  + iStep;
    1642   Int  iStep5  = iStep4  + iStep;
    1643   Int  iStep6  = iStep5  + iStep;
    1644   Int  iStep7  = iStep6  + iStep;
    1645   Int  iStep8  = iStep7  + iStep;
    1646   Int  iStep9  = iStep8  + iStep;
    1647   Int  iStep10 = iStep9  + iStep;
    1648   Int  iStep11 = iStep10 + iStep;
    1649   Int  iStep12 = iStep11 + iStep;
    1650   Int  iStep13 = iStep12 + iStep;
    1651   Int  iStep14 = iStep13 + iStep;
    1652   Int  iStep15 = iStep14 + iStep;
    1653   Int  iStep16 = iStep15 + iStep;
    1654   Int  iStep17 = iStep16 + iStep;
    1655   Int  iStep18 = iStep17 + iStep;
    1656   Int  iStep19 = iStep18 + iStep;
    1657   Int  iStep20 = iStep19 + iStep;
    1658   Int  iStep21 = iStep20 + iStep;
    1659   Int  iStep22 = iStep21 + iStep;
    1660   Int  iStep23 = iStep22 + iStep;
    1661   Int  iStep24 = iStep23 + iStep;
    1662   Int  iStep25 = iStep24 + iStep;
    1663   Int  iStep26 = iStep25 + iStep;
    1664   Int  iStep27 = iStep26 + iStep;
    1665   Int  iStep28 = iStep27 + iStep;
    1666   Int  iStep29 = iStep28 + iStep;
    1667   Int  iStep30 = iStep29 + iStep;
    1668   Int  iStep31 = iStep30 + iStep;
    1669   Pel  pred;
    1670  
    1671   UInt uiSum = 0;
    1672  
    1673   for( ; iRows != 0; iRows-- )
    1674   {
    1675     pred = (piCur[0] + piRef[0] + bRound) >> 1 ;           uiSum += abs( piOrg[0] - pred );
    1676     pred = (piCur[iStep ] + piRef[1] + bRound) >> 1 ;      uiSum += abs( piOrg[1] - pred );
    1677     pred = (piCur[iStep2] + piRef[2] + bRound) >> 1 ;      uiSum += abs( piOrg[2] - pred );
    1678     pred = (piCur[iStep3] + piRef[3] + bRound) >> 1 ;      uiSum += abs( piOrg[3] - pred );
    1679     pred = (piCur[iStep4] + piRef[4] + bRound) >> 1 ;      uiSum += abs( piOrg[4] - pred );
    1680     pred = (piCur[iStep5] + piRef[5] + bRound) >> 1 ;      uiSum += abs( piOrg[5] - pred );
    1681     pred = (piCur[iStep6] + piRef[6] + bRound) >> 1 ;      uiSum += abs( piOrg[6] - pred );
    1682     pred = (piCur[iStep7] + piRef[7] + bRound) >> 1 ;      uiSum += abs( piOrg[7] - pred );
    1683     pred = (piCur[iStep8] + piRef[8] + bRound) >> 1 ;      uiSum += abs( piOrg[8] - pred );
    1684     pred = (piCur[iStep9] + piRef[9] + bRound) >> 1 ;      uiSum += abs( piOrg[9] - pred );
    1685     pred = (piCur[iStep10] + piRef[10] + bRound) >> 1 ;    uiSum += abs( piOrg[10] - pred );
    1686     pred = (piCur[iStep11] + piRef[11] + bRound) >> 1 ;    uiSum += abs( piOrg[11] - pred );     
    1687     pred = (piCur[iStep12] + piRef[12] + bRound) >> 1 ;    uiSum += abs( piOrg[12] - pred );     
    1688     pred = (piCur[iStep13] + piRef[13] + bRound) >> 1 ;    uiSum += abs( piOrg[13] - pred );
    1689     pred = (piCur[iStep14] + piRef[14] + bRound) >> 1 ;    uiSum += abs( piOrg[14] - pred );
    1690     pred = (piCur[iStep15] + piRef[15] + bRound) >> 1 ;    uiSum += abs( piOrg[15] - pred );
    1691     pred = (piCur[iStep16] + piRef[16] + bRound) >> 1 ;      uiSum += abs( piOrg[16] - pred );
    1692     pred = (piCur[iStep17] + piRef[17] + bRound) >> 1 ;      uiSum += abs( piOrg[17] - pred );
    1693     pred = (piCur[iStep18] + piRef[18] + bRound) >> 1 ;      uiSum += abs( piOrg[18] - pred );
    1694     pred = (piCur[iStep19] + piRef[19] + bRound) >> 1 ;      uiSum += abs( piOrg[19] - pred );
    1695     pred = (piCur[iStep20] + piRef[20] + bRound) >> 1 ;      uiSum += abs( piOrg[20] - pred );
    1696     pred = (piCur[iStep21] + piRef[21] + bRound) >> 1 ;      uiSum += abs( piOrg[21] - pred );
    1697     pred = (piCur[iStep22] + piRef[22] + bRound) >> 1 ;      uiSum += abs( piOrg[22] - pred );
    1698     pred = (piCur[iStep23] + piRef[23] + bRound) >> 1 ;      uiSum += abs( piOrg[23] - pred );
    1699     pred = (piCur[iStep24] + piRef[24] + bRound) >> 1 ;      uiSum += abs( piOrg[24] - pred );
    1700     pred = (piCur[iStep25] + piRef[25] + bRound) >> 1 ;      uiSum += abs( piOrg[25] - pred );
    1701     pred = (piCur[iStep26] + piRef[26] + bRound) >> 1 ;      uiSum += abs( piOrg[26] - pred );
    1702     pred = (piCur[iStep27] + piRef[27] + bRound) >> 1 ;      uiSum += abs( piOrg[27] - pred );
    1703     pred = (piCur[iStep28] + piRef[28] + bRound) >> 1 ;      uiSum += abs( piOrg[28] - pred );
    1704     pred = (piCur[iStep29] + piRef[29] + bRound) >> 1 ;      uiSum += abs( piOrg[29] - pred );
    1705     pred = (piCur[iStep30] + piRef[30] + bRound) >> 1 ;      uiSum += abs( piOrg[30] - pred );
    1706     pred = (piCur[iStep31] + piRef[31] + bRound) >> 1 ;      uiSum += abs( piOrg[31] - pred );     
    1707    
    1708    
    1709     piOrg += iStrideOrg;
    1710     piCur += iStrideCur;
    1711     piRef += iStrideRef;
    1712   }
    1713  
    1714   return ( uiSum >> g_uiBitIncrement );
    1715 }
    1716 
    1717 UInt TComRdCost::xGetSADs64( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    1718 {
    1719   Pel* piOrg   = pcDtParam->pOrg;
    1720   Pel* piCur   = pcDtParam->pCur;
    1721   Pel* piRef   = pRefY;
    1722   Int  iRows   = pcDtParam->iRows;
    1723   Int  iStrideCur = pcDtParam->iStrideCur;
    1724   Int  iStrideOrg = pcDtParam->iStrideOrg;
    1725   Int  iStrideRef = pcDtParam->iCols;
    1726   Int  iStep  = pcDtParam->iStep;
    1727   Int  iStep2  = iStep<<1;
    1728   Int  iStep3  = iStep2  + iStep;
    1729   Int  iStep4  = iStep3  + iStep;
    1730   Int  iStep5  = iStep4  + iStep;
    1731   Int  iStep6  = iStep5  + iStep;
    1732   Int  iStep7  = iStep6  + iStep;
    1733   Int  iStep8  = iStep7  + iStep;
    1734   Int  iStep9  = iStep8  + iStep;
    1735   Int  iStep10 = iStep9  + iStep;
    1736   Int  iStep11 = iStep10 + iStep;
    1737   Int  iStep12 = iStep11 + iStep;
    1738   Int  iStep13 = iStep12 + iStep;
    1739   Int  iStep14 = iStep13 + iStep;
    1740   Int  iStep15 = iStep14 + iStep;
    1741   Int  iStep16 = iStep15 + iStep;
    1742   Int  iStep17 = iStep16 + iStep;
    1743   Int  iStep18 = iStep17 + iStep;
    1744   Int  iStep19 = iStep18 + iStep;
    1745   Int  iStep20 = iStep19 + iStep;
    1746   Int  iStep21 = iStep20 + iStep;
    1747   Int  iStep22 = iStep21 + iStep;
    1748   Int  iStep23 = iStep22 + iStep;
    1749   Int  iStep24 = iStep23 + iStep;
    1750   Int  iStep25 = iStep24 + iStep;
    1751   Int  iStep26 = iStep25 + iStep;
    1752   Int  iStep27 = iStep26 + iStep;
    1753   Int  iStep28 = iStep27 + iStep;
    1754   Int  iStep29 = iStep28 + iStep;
    1755   Int  iStep30 = iStep29 + iStep;
    1756   Int  iStep31 = iStep30 + iStep;
    1757   Int  iStep32 = iStep31 + iStep;
    1758   Int  iStep33 = iStep32 + iStep;
    1759   Int  iStep34 = iStep33 + iStep;
    1760   Int  iStep35 = iStep34 + iStep;
    1761   Int  iStep36 = iStep35 + iStep;
    1762   Int  iStep37 = iStep36 + iStep;
    1763   Int  iStep38 = iStep37 + iStep;
    1764   Int  iStep39 = iStep38 + iStep;
    1765   Int  iStep40 = iStep39 + iStep;
    1766   Int  iStep41 = iStep40 + iStep;
    1767   Int  iStep42 = iStep41 + iStep;
    1768   Int  iStep43 = iStep42 + iStep;
    1769   Int  iStep44 = iStep43 + iStep;
    1770   Int  iStep45 = iStep44 + iStep;
    1771   Int  iStep46 = iStep45 + iStep;
    1772   Int  iStep47 = iStep46 + iStep;
    1773   Int  iStep48 = iStep47 + iStep;
    1774   Int  iStep49 = iStep48 + iStep;
    1775   Int  iStep50 = iStep49 + iStep;
    1776   Int  iStep51 = iStep50 + iStep;
    1777   Int  iStep52 = iStep51 + iStep;
    1778   Int  iStep53 = iStep52 + iStep;
    1779   Int  iStep54 = iStep53 + iStep;
    1780   Int  iStep55 = iStep54 + iStep;
    1781   Int  iStep56 = iStep55 + iStep;
    1782   Int  iStep57 = iStep56 + iStep;
    1783   Int  iStep58 = iStep57 + iStep;
    1784   Int  iStep59 = iStep58 + iStep;
    1785   Int  iStep60 = iStep59 + iStep;
    1786   Int  iStep61 = iStep60 + iStep;
    1787   Int  iStep62 = iStep61 + iStep;
    1788   Int  iStep63 = iStep62 + iStep;
    1789   Pel  pred;
    1790  
    1791   UInt uiSum = 0;
    1792  
    1793   for( ; iRows != 0; iRows-- )
    1794   {
    1795     pred = (piCur[0] + piRef[0] + bRound) >> 1 ;           uiSum += abs( piOrg[0] - pred );
    1796     pred = (piCur[iStep ] + piRef[1] + bRound) >> 1 ;      uiSum += abs( piOrg[1] - pred );
    1797     pred = (piCur[iStep2] + piRef[2] + bRound) >> 1 ;      uiSum += abs( piOrg[2] - pred );
    1798     pred = (piCur[iStep3] + piRef[3] + bRound) >> 1 ;      uiSum += abs( piOrg[3] - pred );
    1799     pred = (piCur[iStep4] + piRef[4] + bRound) >> 1 ;      uiSum += abs( piOrg[4] - pred );
    1800     pred = (piCur[iStep5] + piRef[5] + bRound) >> 1 ;      uiSum += abs( piOrg[5] - pred );
    1801     pred = (piCur[iStep6] + piRef[6] + bRound) >> 1 ;      uiSum += abs( piOrg[6] - pred );
    1802     pred = (piCur[iStep7] + piRef[7] + bRound) >> 1 ;      uiSum += abs( piOrg[7] - pred );
    1803     pred = (piCur[iStep8] + piRef[8] + bRound) >> 1 ;      uiSum += abs( piOrg[8] - pred );
    1804     pred = (piCur[iStep9] + piRef[9] + bRound) >> 1 ;      uiSum += abs( piOrg[9] - pred );
    1805    
    1806     pred = (piCur[iStep10] + piRef[10] + bRound) >> 1 ;    uiSum += abs( piOrg[10] - pred );
    1807     pred = (piCur[iStep11] + piRef[11] + bRound) >> 1 ;    uiSum += abs( piOrg[11] - pred );     
    1808     pred = (piCur[iStep12] + piRef[12] + bRound) >> 1 ;    uiSum += abs( piOrg[12] - pred );     
    1809     pred = (piCur[iStep13] + piRef[13] + bRound) >> 1 ;    uiSum += abs( piOrg[13] - pred );
    1810     pred = (piCur[iStep14] + piRef[14] + bRound) >> 1 ;    uiSum += abs( piOrg[14] - pred );
    1811     pred = (piCur[iStep15] + piRef[15] + bRound) >> 1 ;    uiSum += abs( piOrg[15] - pred );
    1812     pred = (piCur[iStep16] + piRef[16] + bRound) >> 1 ;      uiSum += abs( piOrg[16] - pred );
    1813     pred = (piCur[iStep17] + piRef[17] + bRound) >> 1 ;      uiSum += abs( piOrg[17] - pred );
    1814     pred = (piCur[iStep18] + piRef[18] + bRound) >> 1 ;      uiSum += abs( piOrg[18] - pred );
    1815     pred = (piCur[iStep19] + piRef[19] + bRound) >> 1 ;      uiSum += abs( piOrg[19] - pred );
    1816     pred = (piCur[iStep20] + piRef[20] + bRound) >> 1 ;      uiSum += abs( piOrg[20] - pred );
    1817    
    1818     pred = (piCur[iStep21] + piRef[21] + bRound) >> 1 ;      uiSum += abs( piOrg[21] - pred );
    1819     pred = (piCur[iStep22] + piRef[22] + bRound) >> 1 ;      uiSum += abs( piOrg[22] - pred );
    1820     pred = (piCur[iStep23] + piRef[23] + bRound) >> 1 ;      uiSum += abs( piOrg[23] - pred );
    1821     pred = (piCur[iStep24] + piRef[24] + bRound) >> 1 ;      uiSum += abs( piOrg[24] - pred );
    1822     pred = (piCur[iStep25] + piRef[25] + bRound) >> 1 ;      uiSum += abs( piOrg[25] - pred );
    1823     pred = (piCur[iStep26] + piRef[26] + bRound) >> 1 ;      uiSum += abs( piOrg[26] - pred );
    1824     pred = (piCur[iStep27] + piRef[27] + bRound) >> 1 ;      uiSum += abs( piOrg[27] - pred );
    1825     pred = (piCur[iStep28] + piRef[28] + bRound) >> 1 ;      uiSum += abs( piOrg[28] - pred );
    1826     pred = (piCur[iStep29] + piRef[29] + bRound) >> 1 ;      uiSum += abs( piOrg[29] - pred );
    1827    
    1828     pred = (piCur[iStep30] + piRef[30] + bRound) >> 1 ;    uiSum += abs( piOrg[30] - pred );
    1829     pred = (piCur[iStep31] + piRef[31] + bRound) >> 1 ;    uiSum += abs( piOrg[31] - pred );     
    1830     pred = (piCur[iStep32] + piRef[32] + bRound) >> 1 ;    uiSum += abs( piOrg[32] - pred );     
    1831     pred = (piCur[iStep33] + piRef[33] + bRound) >> 1 ;    uiSum += abs( piOrg[33] - pred );
    1832     pred = (piCur[iStep34] + piRef[34] + bRound) >> 1 ;    uiSum += abs( piOrg[34] - pred );
    1833     pred = (piCur[iStep35] + piRef[35] + bRound) >> 1 ;    uiSum += abs( piOrg[35] - pred );
    1834     pred = (piCur[iStep36] + piRef[36] + bRound) >> 1 ;      uiSum += abs( piOrg[36] - pred );
    1835     pred = (piCur[iStep37] + piRef[37] + bRound) >> 1 ;      uiSum += abs( piOrg[37] - pred );
    1836     pred = (piCur[iStep38] + piRef[38] + bRound) >> 1 ;      uiSum += abs( piOrg[38] - pred );
    1837     pred = (piCur[iStep39] + piRef[39] + bRound) >> 1 ;      uiSum += abs( piOrg[39] - pred );
    1838    
    1839     pred = (piCur[iStep40] + piRef[40] + bRound) >> 1 ;      uiSum += abs( piOrg[40] - pred );
    1840     pred = (piCur[iStep41] + piRef[41] + bRound) >> 1 ;      uiSum += abs( piOrg[41] - pred );
    1841     pred = (piCur[iStep42] + piRef[42] + bRound) >> 1 ;      uiSum += abs( piOrg[42] - pred );
    1842     pred = (piCur[iStep43] + piRef[43] + bRound) >> 1 ;      uiSum += abs( piOrg[43] - pred );
    1843     pred = (piCur[iStep44] + piRef[44] + bRound) >> 1 ;      uiSum += abs( piOrg[44] - pred );
    1844     pred = (piCur[iStep45] + piRef[45] + bRound) >> 1 ;      uiSum += abs( piOrg[45] - pred );
    1845     pred = (piCur[iStep46] + piRef[46] + bRound) >> 1 ;      uiSum += abs( piOrg[46] - pred );
    1846     pred = (piCur[iStep47] + piRef[47] + bRound) >> 1 ;      uiSum += abs( piOrg[47] - pred );
    1847     pred = (piCur[iStep48] + piRef[48] + bRound) >> 1 ;      uiSum += abs( piOrg[48] - pred );
    1848     pred = (piCur[iStep49] + piRef[49] + bRound) >> 1 ;      uiSum += abs( piOrg[49] - pred );
    1849    
    1850     pred = (piCur[iStep50] + piRef[50] + bRound) >> 1 ;    uiSum += abs( piOrg[50] - pred );
    1851     pred = (piCur[iStep51] + piRef[51] + bRound) >> 1 ;    uiSum += abs( piOrg[51] - pred );     
    1852     pred = (piCur[iStep52] + piRef[52] + bRound) >> 1 ;    uiSum += abs( piOrg[52] - pred );     
    1853     pred = (piCur[iStep53] + piRef[53] + bRound) >> 1 ;    uiSum += abs( piOrg[53] - pred );
    1854     pred = (piCur[iStep54] + piRef[54] + bRound) >> 1 ;    uiSum += abs( piOrg[54] - pred );
    1855     pred = (piCur[iStep55] + piRef[55] + bRound) >> 1 ;    uiSum += abs( piOrg[55] - pred );
    1856     pred = (piCur[iStep56] + piRef[56] + bRound) >> 1 ;      uiSum += abs( piOrg[56] - pred );
    1857     pred = (piCur[iStep57] + piRef[57] + bRound) >> 1 ;      uiSum += abs( piOrg[57] - pred );
    1858     pred = (piCur[iStep58] + piRef[58] + bRound) >> 1 ;      uiSum += abs( piOrg[58] - pred );
    1859     pred = (piCur[iStep59] + piRef[59] + bRound) >> 1 ;      uiSum += abs( piOrg[59] - pred );
    1860    
    1861     pred = (piCur[iStep60] + piRef[60] + bRound) >> 1 ;      uiSum += abs( piOrg[60] - pred );
    1862     pred = (piCur[iStep61] + piRef[61] + bRound) >> 1 ;      uiSum += abs( piOrg[61] - pred );
    1863     pred = (piCur[iStep62] + piRef[62] + bRound) >> 1 ;      uiSum += abs( piOrg[62] - pred );
    1864     pred = (piCur[iStep63] + piRef[63] + bRound) >> 1 ;      uiSum += abs( piOrg[63] - pred );
    1865    
    1866     piOrg += iStrideOrg;
    1867     piCur += iStrideCur;
    1868     piRef += iStrideRef;
    1869   }
    1870  
    1871   return ( uiSum >> g_uiBitIncrement );
    1872 }
    1873 #endif
    1874 
    1875 UInt TComRdCost::xGetSADs( DistParam* pcDtParam )
    1876 {
    1877 #ifdef WEIGHT_PRED
    1878   if ( pcDtParam->applyWeight )
    1879   {
    1880     return xGetSADsw( pcDtParam );
    1881   }
    1882 #endif
    1883   Pel* piOrg   = pcDtParam->pOrg;
    1884   Pel* piCur   = pcDtParam->pCur;
    1885   Int  iRows   = pcDtParam->iRows;
    1886   Int  iCols   = pcDtParam->iCols;
    1887   Int  iStrideCur = pcDtParam->iStrideCur;
    1888   Int  iStrideOrg = pcDtParam->iStrideOrg;
    1889   Int  iStep  = pcDtParam->iStep;
    1890  
    1891   UInt uiSum = 0;
    1892  
    1893   for( ; iRows != 0; iRows-- )
    1894   {
    1895     for (Int n = 0; n < iCols; n++ )
    1896     {
    1897       uiSum += abs( piOrg[n] - piCur[n*iStep] );
    1898     }
    1899     piOrg += iStrideOrg;
    1900     piCur += iStrideCur;
    1901   }
    1902  
    1903   return ( uiSum >> g_uiBitIncrement );
    1904 }
    1905 
    1906 UInt TComRdCost::xGetSADs4( DistParam* pcDtParam )
    1907 {
    1908 #ifdef WEIGHT_PRED
    1909   if ( pcDtParam->applyWeight )
    1910   {
    1911     return xGetSADs4w( pcDtParam );
    1912   }
    1913 #endif
    1914   Pel* piOrg   = pcDtParam->pOrg;
    1915   Pel* piCur   = pcDtParam->pCur;
    1916   Int  iRows   = pcDtParam->iRows;
    1917   Int  iStrideCur = pcDtParam->iStrideCur;
    1918   Int  iStrideOrg = pcDtParam->iStrideOrg;
    1919   Int  iStep  = pcDtParam->iStep;
    1920   Int  iStep2 = iStep<<1;
    1921   Int  iStep3 = iStep2 + iStep;
    1922  
    1923   UInt uiSum = 0;
    1924  
    1925   for( ; iRows != 0; iRows-- )
    1926   {
    1927     uiSum += abs( piOrg[0] - piCur[     0] );
    1928     uiSum += abs( piOrg[1] - piCur[iStep ] );
    1929     uiSum += abs( piOrg[2] - piCur[iStep2] );
    1930     uiSum += abs( piOrg[3] - piCur[iStep3] );
    1931    
    1932     piOrg += iStrideOrg;
    1933     piCur += iStrideCur;
    1934   }
    1935  
    1936   return ( uiSum >> g_uiBitIncrement );
    1937 }
    1938 
    1939 UInt TComRdCost::xGetSADs8( DistParam* pcDtParam )
    1940 {
    1941 #ifdef WEIGHT_PRED
    1942   if ( pcDtParam->applyWeight )
    1943   {
    1944     return xGetSADs8w( pcDtParam );
    1945   }
    1946 #endif
    1947   Pel* piOrg   = pcDtParam->pOrg;
    1948   Pel* piCur   = pcDtParam->pCur;
    1949   Int  iRows   = pcDtParam->iRows;
    1950   Int  iStrideCur = pcDtParam->iStrideCur;
    1951   Int  iStrideOrg = pcDtParam->iStrideOrg;
    1952   Int  iStep  = pcDtParam->iStep;
    1953   Int  iStep2 = iStep<<1;
    1954   Int  iStep3 = iStep2 + iStep;
    1955   Int  iStep4 = iStep3 + iStep;
    1956   Int  iStep5 = iStep4 + iStep;
    1957   Int  iStep6 = iStep5 + iStep;
    1958   Int  iStep7 = iStep6 + iStep;
    1959  
    1960   UInt uiSum = 0;
    1961  
    1962   for( ; iRows != 0; iRows-- )
    1963   {
    1964     uiSum += abs( piOrg[0] - piCur[     0] );
    1965     uiSum += abs( piOrg[1] - piCur[iStep ] );
    1966     uiSum += abs( piOrg[2] - piCur[iStep2] );
    1967     uiSum += abs( piOrg[3] - piCur[iStep3] );
    1968     uiSum += abs( piOrg[4] - piCur[iStep4] );
    1969     uiSum += abs( piOrg[5] - piCur[iStep5] );
    1970     uiSum += abs( piOrg[6] - piCur[iStep6] );
    1971     uiSum += abs( piOrg[7] - piCur[iStep7] );
    1972    
    1973     piOrg += iStrideOrg;
    1974     piCur += iStrideCur;
    1975   }
    1976  
    1977   return ( uiSum >> g_uiBitIncrement );
    1978 }
    1979 
    1980 UInt TComRdCost::xGetSADs16( DistParam* pcDtParam )
    1981 {
    1982 #ifdef WEIGHT_PRED
    1983   if ( pcDtParam->applyWeight )
    1984   {
    1985     return xGetSADs16w( pcDtParam );
    1986   }
    1987 #endif
    1988   Pel* piOrg   = pcDtParam->pOrg;
    1989   Pel* piCur   = pcDtParam->pCur;
    1990   Int  iRows   = pcDtParam->iRows;
    1991   Int  iStrideCur = pcDtParam->iStrideCur;
    1992   Int  iStrideOrg = pcDtParam->iStrideOrg;
    1993   Int  iStep   = pcDtParam->iStep;
    1994   Int  iStep2  = iStep<<1;
    1995   Int  iStep3  = iStep2  + iStep;
    1996   Int  iStep4  = iStep3  + iStep;
    1997   Int  iStep5  = iStep4  + iStep;
    1998   Int  iStep6  = iStep5  + iStep;
    1999   Int  iStep7  = iStep6  + iStep;
    2000   Int  iStep8  = iStep7  + iStep;
    2001   Int  iStep9  = iStep8  + iStep;
    2002   Int  iStep10 = iStep9  + iStep;
    2003   Int  iStep11 = iStep10 + iStep;
    2004   Int  iStep12 = iStep11 + iStep;
    2005   Int  iStep13 = iStep12 + iStep;
    2006   Int  iStep14 = iStep13 + iStep;
    2007   Int  iStep15 = iStep14 + iStep;
    2008  
    2009   UInt uiSum = 0;
    2010  
    2011   for( ; iRows != 0; iRows-- )
    2012   {
    2013     uiSum += abs( piOrg[ 0] - piCur[      0] );
    2014     uiSum += abs( piOrg[ 1] - piCur[iStep  ] );
    2015     uiSum += abs( piOrg[ 2] - piCur[iStep2 ] );
    2016     uiSum += abs( piOrg[ 3] - piCur[iStep3 ] );
    2017     uiSum += abs( piOrg[ 4] - piCur[iStep4 ] );
    2018     uiSum += abs( piOrg[ 5] - piCur[iStep5 ] );
    2019     uiSum += abs( piOrg[ 6] - piCur[iStep6 ] );
    2020     uiSum += abs( piOrg[ 7] - piCur[iStep7 ] );
    2021     uiSum += abs( piOrg[ 8] - piCur[iStep8 ] );
    2022     uiSum += abs( piOrg[ 9] - piCur[iStep9 ] );
    2023     uiSum += abs( piOrg[10] - piCur[iStep10] );
    2024     uiSum += abs( piOrg[11] - piCur[iStep11] );
    2025     uiSum += abs( piOrg[12] - piCur[iStep12] );
    2026     uiSum += abs( piOrg[13] - piCur[iStep13] );
    2027     uiSum += abs( piOrg[14] - piCur[iStep14] );
    2028     uiSum += abs( piOrg[15] - piCur[iStep15] );
    2029    
    2030     piOrg += iStrideOrg;
    2031     piCur += iStrideCur;
    2032   }
    2033  
    2034   return ( uiSum >> g_uiBitIncrement );
    2035 }
    2036 
    2037 UInt TComRdCost::xGetSADs16N( DistParam* pcDtParam )
    2038 {
    2039 #ifdef WEIGHT_PRED
    2040   if ( pcDtParam->applyWeight )
    2041   {
    2042     return xGetSADs16Nw( pcDtParam );
    2043   }
    2044 #endif
    2045   Pel* piOrg   = pcDtParam->pOrg;
    2046   Pel* piCur   = pcDtParam->pCur;
    2047   Int  iRows   = pcDtParam->iRows;
    2048   Int  iCols   = pcDtParam->iCols;
    2049   Int  iStrideCur = pcDtParam->iStrideCur;
    2050   Int  iStrideOrg = pcDtParam->iStrideOrg;
    2051   Int  iStep  = pcDtParam->iStep;
    2052  
    2053   UInt uiSum = 0;
    2054  
    2055   for( ; iRows != 0; iRows-- )
    2056   {
    2057     for (Int n = 0; n < iCols; n+=16 )
    2058     {
    2059       uiSum += abs( piOrg[n +0] - piCur[iStep*(n +0)] );
    2060       uiSum += abs( piOrg[n +1] - piCur[iStep*(n +1)] );
    2061       uiSum += abs( piOrg[n +2] - piCur[iStep*(n +2)] );
    2062       uiSum += abs( piOrg[n +3] - piCur[iStep*(n +3)] );
    2063       uiSum += abs( piOrg[n +4] - piCur[iStep*(n +4)] );
    2064       uiSum += abs( piOrg[n +5] - piCur[iStep*(n +5)] );
    2065       uiSum += abs( piOrg[n +6] - piCur[iStep*(n +6)] );
    2066       uiSum += abs( piOrg[n +7] - piCur[iStep*(n +7)] );
    2067       uiSum += abs( piOrg[n +8] - piCur[iStep*(n +8)] );
    2068       uiSum += abs( piOrg[n +9] - piCur[iStep*(n +9)] );
    2069       uiSum += abs( piOrg[n+10] - piCur[iStep*(n+10)] );
    2070       uiSum += abs( piOrg[n+11] - piCur[iStep*(n+11)] );
    2071       uiSum += abs( piOrg[n+12] - piCur[iStep*(n+12)] );
    2072       uiSum += abs( piOrg[n+13] - piCur[iStep*(n+13)] );
    2073       uiSum += abs( piOrg[n+14] - piCur[iStep*(n+14)] );
    2074       uiSum += abs( piOrg[n+15] - piCur[iStep*(n+15)] );
    2075     }
    2076     piOrg += iStrideOrg;
    2077     piCur += iStrideCur;
    2078   }
    2079  
    2080   return ( uiSum >> g_uiBitIncrement );
    2081 }
    2082 
    2083 UInt TComRdCost::xGetSADs32( DistParam* pcDtParam )
    2084 {
    2085 #ifdef WEIGHT_PRED
    2086   if ( pcDtParam->applyWeight )
    2087   {
    2088     return xGetSADs32w( pcDtParam );
    2089   }
    2090 #endif
    2091   Pel* piOrg   = pcDtParam->pOrg;
    2092   Pel* piCur   = pcDtParam->pCur;
    2093   Int  iRows   = pcDtParam->iRows;
    2094   Int  iStrideCur = pcDtParam->iStrideCur;
    2095   Int  iStrideOrg = pcDtParam->iStrideOrg;
    2096   Int  iStep  = pcDtParam->iStep;
    2097   Int  iStep2  = iStep<<1;
    2098   Int  iStep3  = iStep2  + iStep;
    2099   Int  iStep4  = iStep3  + iStep;
    2100   Int  iStep5  = iStep4  + iStep;
    2101   Int  iStep6  = iStep5  + iStep;
    2102   Int  iStep7  = iStep6  + iStep;
    2103   Int  iStep8  = iStep7  + iStep;
    2104   Int  iStep9  = iStep8  + iStep;
    2105   Int  iStep10 = iStep9  + iStep;
    2106   Int  iStep11 = iStep10 + iStep;
    2107   Int  iStep12 = iStep11 + iStep;
    2108   Int  iStep13 = iStep12 + iStep;
    2109   Int  iStep14 = iStep13 + iStep;
    2110   Int  iStep15 = iStep14 + iStep;
    2111   Int  iStep16 = iStep15 + iStep;
    2112   Int  iStep17 = iStep16 + iStep;
    2113   Int  iStep18 = iStep17 + iStep;
    2114   Int  iStep19 = iStep18 + iStep;
    2115   Int  iStep20 = iStep19 + iStep;
    2116   Int  iStep21 = iStep20 + iStep;
    2117   Int  iStep22 = iStep21 + iStep;
    2118   Int  iStep23 = iStep22 + iStep;
    2119   Int  iStep24 = iStep23 + iStep;
    2120   Int  iStep25 = iStep24 + iStep;
    2121   Int  iStep26 = iStep25 + iStep;
    2122   Int  iStep27 = iStep26 + iStep;
    2123   Int  iStep28 = iStep27 + iStep;
    2124   Int  iStep29 = iStep28 + iStep;
    2125   Int  iStep30 = iStep29 + iStep;
    2126   Int  iStep31 = iStep30 + iStep;
    2127  
    2128   UInt uiSum = 0;
    2129  
    2130   for( ; iRows != 0; iRows-- )
    2131   {
    2132     uiSum += abs( piOrg[ 0] - piCur[      0] );
    2133     uiSum += abs( piOrg[ 1] - piCur[iStep  ] );
    2134     uiSum += abs( piOrg[ 2] - piCur[iStep2 ] );
    2135     uiSum += abs( piOrg[ 3] - piCur[iStep3 ] );
    2136     uiSum += abs( piOrg[ 4] - piCur[iStep4 ] );
    2137     uiSum += abs( piOrg[ 5] - piCur[iStep5 ] );
    2138     uiSum += abs( piOrg[ 6] - piCur[iStep6 ] );
    2139     uiSum += abs( piOrg[ 7] - piCur[iStep7 ] );
    2140     uiSum += abs( piOrg[ 8] - piCur[iStep8 ] );
    2141     uiSum += abs( piOrg[ 9] - piCur[iStep9 ] );
    2142     uiSum += abs( piOrg[10] - piCur[iStep10] );
    2143     uiSum += abs( piOrg[11] - piCur[iStep11] );
    2144     uiSum += abs( piOrg[12] - piCur[iStep12] );
    2145     uiSum += abs( piOrg[13] - piCur[iStep13] );
    2146     uiSum += abs( piOrg[14] - piCur[iStep14] );
    2147     uiSum += abs( piOrg[15] - piCur[iStep15] );
    2148     uiSum += abs( piOrg[16] - piCur[iStep16] );
    2149     uiSum += abs( piOrg[17] - piCur[iStep17] );
    2150     uiSum += abs( piOrg[18] - piCur[iStep18] );
    2151     uiSum += abs( piOrg[19] - piCur[iStep19] );
    2152     uiSum += abs( piOrg[20] - piCur[iStep20] );
    2153     uiSum += abs( piOrg[21] - piCur[iStep21] );
    2154     uiSum += abs( piOrg[22] - piCur[iStep22] );
    2155     uiSum += abs( piOrg[23] - piCur[iStep23] );
    2156     uiSum += abs( piOrg[24] - piCur[iStep24] );
    2157     uiSum += abs( piOrg[25] - piCur[iStep25] );
    2158     uiSum += abs( piOrg[26] - piCur[iStep26] );
    2159     uiSum += abs( piOrg[27] - piCur[iStep27] );
    2160     uiSum += abs( piOrg[28] - piCur[iStep28] );
    2161     uiSum += abs( piOrg[29] - piCur[iStep29] );
    2162     uiSum += abs( piOrg[30] - piCur[iStep30] );
    2163     uiSum += abs( piOrg[31] - piCur[iStep31] );
    2164    
    2165     piOrg += iStrideOrg;
    2166     piCur += iStrideCur;
    2167   }
    2168  
    2169   return ( uiSum >> g_uiBitIncrement );
    2170 }
    2171 
    2172 UInt TComRdCost::xGetSADs64( DistParam* pcDtParam )
    2173 {
    2174 #ifdef WEIGHT_PRED
    2175   if ( pcDtParam->applyWeight )
    2176   {
    2177     return xGetSADs64w( pcDtParam );
    2178   }
    2179 #endif
    2180   Pel* piOrg   = pcDtParam->pOrg;
    2181   Pel* piCur   = pcDtParam->pCur;
    2182   Int  iRows   = pcDtParam->iRows;
    2183   Int  iStrideCur = pcDtParam->iStrideCur;
    2184   Int  iStrideOrg = pcDtParam->iStrideOrg;
    2185   Int  iStep  = pcDtParam->iStep;
    2186   Int  iStep2  = iStep<<1;
    2187   Int  iStep3  = iStep2  + iStep;
    2188   Int  iStep4  = iStep3  + iStep;
    2189   Int  iStep5  = iStep4  + iStep;
    2190   Int  iStep6  = iStep5  + iStep;
    2191   Int  iStep7  = iStep6  + iStep;
    2192   Int  iStep8  = iStep7  + iStep;
    2193   Int  iStep9  = iStep8  + iStep;
    2194   Int  iStep10 = iStep9  + iStep;
    2195   Int  iStep11 = iStep10 + iStep;
    2196   Int  iStep12 = iStep11 + iStep;
    2197   Int  iStep13 = iStep12 + iStep;
    2198   Int  iStep14 = iStep13 + iStep;
    2199   Int  iStep15 = iStep14 + iStep;
    2200   Int  iStep16 = iStep15 + iStep;
    2201   Int  iStep17 = iStep16 + iStep;
    2202   Int  iStep18 = iStep17 + iStep;
    2203   Int  iStep19 = iStep18 + iStep;
    2204   Int  iStep20 = iStep19 + iStep;
    2205   Int  iStep21 = iStep20 + iStep;
    2206   Int  iStep22 = iStep21 + iStep;
    2207   Int  iStep23 = iStep22 + iStep;
    2208   Int  iStep24 = iStep23 + iStep;
    2209   Int  iStep25 = iStep24 + iStep;
    2210   Int  iStep26 = iStep25 + iStep;
    2211   Int  iStep27 = iStep26 + iStep;
    2212   Int  iStep28 = iStep27 + iStep;
    2213   Int  iStep29 = iStep28 + iStep;
    2214   Int  iStep30 = iStep29 + iStep;
    2215   Int  iStep31 = iStep30 + iStep;
    2216   Int  iStep32 = iStep31 + iStep;
    2217   Int  iStep33 = iStep32 + iStep;
    2218   Int  iStep34 = iStep33 + iStep;
    2219   Int  iStep35 = iStep34 + iStep;
    2220   Int  iStep36 = iStep35 + iStep;
    2221   Int  iStep37 = iStep36 + iStep;
    2222   Int  iStep38 = iStep37 + iStep;
    2223   Int  iStep39 = iStep38 + iStep;
    2224   Int  iStep40 = iStep39 + iStep;
    2225   Int  iStep41 = iStep40 + iStep;
    2226   Int  iStep42 = iStep41 + iStep;
    2227   Int  iStep43 = iStep42 + iStep;
    2228   Int  iStep44 = iStep43 + iStep;
    2229   Int  iStep45 = iStep44 + iStep;
    2230   Int  iStep46 = iStep45 + iStep;
    2231   Int  iStep47 = iStep46 + iStep;
    2232   Int  iStep48 = iStep47 + iStep;
    2233   Int  iStep49 = iStep48 + iStep;
    2234   Int  iStep50 = iStep49 + iStep;
    2235   Int  iStep51 = iStep50 + iStep;
    2236   Int  iStep52 = iStep51 + iStep;
    2237   Int  iStep53 = iStep52 + iStep;
    2238   Int  iStep54 = iStep53 + iStep;
    2239   Int  iStep55 = iStep54 + iStep;
    2240   Int  iStep56 = iStep55 + iStep;
    2241   Int  iStep57 = iStep56 + iStep;
    2242   Int  iStep58 = iStep57 + iStep;
    2243   Int  iStep59 = iStep58 + iStep;
    2244   Int  iStep60 = iStep59 + iStep;
    2245   Int  iStep61 = iStep60 + iStep;
    2246   Int  iStep62 = iStep61 + iStep;
    2247   Int  iStep63 = iStep62 + iStep;
    2248  
    2249   UInt uiSum = 0;
    2250  
    2251   for( ; iRows != 0; iRows-- )
    2252   {
    2253     uiSum += abs( piOrg[ 0] - piCur[      0] );
    2254     uiSum += abs( piOrg[ 1] - piCur[iStep  ] );
    2255     uiSum += abs( piOrg[ 2] - piCur[iStep2 ] );
    2256     uiSum += abs( piOrg[ 3] - piCur[iStep3 ] );
    2257     uiSum += abs( piOrg[ 4] - piCur[iStep4 ] );
    2258     uiSum += abs( piOrg[ 5] - piCur[iStep5 ] );
    2259     uiSum += abs( piOrg[ 6] - piCur[iStep6 ] );
    2260     uiSum += abs( piOrg[ 7] - piCur[iStep7 ] );
    2261     uiSum += abs( piOrg[ 8] - piCur[iStep8 ] );
    2262     uiSum += abs( piOrg[ 9] - piCur[iStep9 ] );
    2263     uiSum += abs( piOrg[10] - piCur[iStep10] );
    2264     uiSum += abs( piOrg[11] - piCur[iStep11] );
    2265     uiSum += abs( piOrg[12] - piCur[iStep12] );
    2266     uiSum += abs( piOrg[13] - piCur[iStep13] );
    2267     uiSum += abs( piOrg[14] - piCur[iStep14] );
    2268     uiSum += abs( piOrg[15] - piCur[iStep15] );
    2269     uiSum += abs( piOrg[16] - piCur[iStep16] );
    2270     uiSum += abs( piOrg[17] - piCur[iStep17] );
    2271     uiSum += abs( piOrg[18] - piCur[iStep18] );
    2272     uiSum += abs( piOrg[19] - piCur[iStep19] );
    2273     uiSum += abs( piOrg[20] - piCur[iStep20] );
    2274     uiSum += abs( piOrg[21] - piCur[iStep21] );
    2275     uiSum += abs( piOrg[22] - piCur[iStep22] );
    2276     uiSum += abs( piOrg[23] - piCur[iStep23] );
    2277     uiSum += abs( piOrg[24] - piCur[iStep24] );
    2278     uiSum += abs( piOrg[25] - piCur[iStep25] );
    2279     uiSum += abs( piOrg[26] - piCur[iStep26] );
    2280     uiSum += abs( piOrg[27] - piCur[iStep27] );
    2281     uiSum += abs( piOrg[28] - piCur[iStep28] );
    2282     uiSum += abs( piOrg[29] - piCur[iStep29] );
    2283     uiSum += abs( piOrg[30] - piCur[iStep30] );
    2284     uiSum += abs( piOrg[31] - piCur[iStep31] );
    2285     uiSum += abs( piOrg[32] - piCur[iStep32] );
    2286     uiSum += abs( piOrg[33] - piCur[iStep33] );
    2287     uiSum += abs( piOrg[34] - piCur[iStep34] );
    2288     uiSum += abs( piOrg[35] - piCur[iStep35] );
    2289     uiSum += abs( piOrg[36] - piCur[iStep36] );
    2290     uiSum += abs( piOrg[37] - piCur[iStep37] );
    2291     uiSum += abs( piOrg[38] - piCur[iStep38] );
    2292     uiSum += abs( piOrg[39] - piCur[iStep39] );
    2293     uiSum += abs( piOrg[40] - piCur[iStep40] );
    2294     uiSum += abs( piOrg[41] - piCur[iStep41] );
    2295     uiSum += abs( piOrg[42] - piCur[iStep42] );
    2296     uiSum += abs( piOrg[43] - piCur[iStep43] );
    2297     uiSum += abs( piOrg[44] - piCur[iStep44] );
    2298     uiSum += abs( piOrg[45] - piCur[iStep45] );
    2299     uiSum += abs( piOrg[46] - piCur[iStep46] );
    2300     uiSum += abs( piOrg[47] - piCur[iStep47] );
    2301     uiSum += abs( piOrg[48] - piCur[iStep48] );
    2302     uiSum += abs( piOrg[49] - piCur[iStep49] );
    2303     uiSum += abs( piOrg[50] - piCur[iStep50] );
    2304     uiSum += abs( piOrg[51] - piCur[iStep51] );
    2305     uiSum += abs( piOrg[52] - piCur[iStep52] );
    2306     uiSum += abs( piOrg[53] - piCur[iStep53] );
    2307     uiSum += abs( piOrg[54] - piCur[iStep54] );
    2308     uiSum += abs( piOrg[55] - piCur[iStep55] );
    2309     uiSum += abs( piOrg[56] - piCur[iStep56] );
    2310     uiSum += abs( piOrg[57] - piCur[iStep57] );
    2311     uiSum += abs( piOrg[58] - piCur[iStep58] );
    2312     uiSum += abs( piOrg[59] - piCur[iStep59] );
    2313     uiSum += abs( piOrg[60] - piCur[iStep60] );
    2314     uiSum += abs( piOrg[61] - piCur[iStep61] );
    2315     uiSum += abs( piOrg[62] - piCur[iStep62] );
    2316     uiSum += abs( piOrg[63] - piCur[iStep63] );
    2317    
    2318     piOrg += iStrideOrg;
    2319     piCur += iStrideCur;
    2320   }
    2321  
    2322   return ( uiSum >> g_uiBitIncrement );
    2323 }
     1052#endif
    23241053
    23251054// --------------------------------------------------------------------------------------------------------------------
     
    23281057
    23291058#if IBDI_DISTORTION
    2330 #ifdef ROUNDING_CONTROL_BIPRED
    2331 UInt TComRdCost::xGetSSE( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    2332 {
    2333   Pel* piOrg   = pcDtParam->pOrg;
    2334   Pel* piCur   = pcDtParam->pCur;
    2335   Pel* piRef   = pRefY;
    2336   Int  iRows   = pcDtParam->iRows;
    2337   Int  iCols   = pcDtParam->iCols;
    2338   Int  iStrideOrg = pcDtParam->iStrideOrg;
    2339   Int  iStrideCur = pcDtParam->iStrideCur;
    2340   Pel  pred;
    2341 
    2342   UInt uiSum = 0;
    2343   Int  iShift = g_uiBitIncrement;
    2344   Int  iOffset = (g_uiBitIncrement>0)? (1<<(g_uiBitIncrement-1)):0;
    2345 
    2346   Int iTemp;
    2347 
    2348   for( ; iRows != 0; iRows-- )
    2349   {
    2350     for (Int n = 0; n < iCols; n++ )
    2351     {
    2352       pred = (piCur[n] + piRef[n] + bRound) >> 1 ;
    2353       iTemp = ((piOrg[n]+iOffset)>>iShift) - ((pred+iOffset)>>iShift);
    2354       uiSum += iTemp * iTemp;
    2355     }
    2356     piOrg += iStrideOrg;
    2357     piCur += iStrideCur;
    2358     piRef += iCols;
    2359   }
    2360 
    2361   return ( uiSum );
    2362 }
    2363 
    2364 UInt TComRdCost::xGetSSE4( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    2365 {
    2366   Pel* piOrg   = pcDtParam->pOrg;
    2367   Pel* piCur   = pcDtParam->pCur;
    2368   Pel* piRef   = pRefY;
    2369   Int  iRows   = pcDtParam->iRows;
    2370   Int  iStrideOrg = pcDtParam->iStrideOrg;
    2371   Int  iStrideCur = pcDtParam->iStrideCur;
    2372   Int  iStrideRef =  pcDtParam->iCols;
    2373   Pel  pred;
    2374 
    2375   UInt uiSum = 0;
    2376   Int  iShift = g_uiBitIncrement;
    2377   Int  iOffset = (g_uiBitIncrement>0)? (1<<(g_uiBitIncrement-1)):0;
    2378 
    2379   Int  iTemp;
    2380 
    2381   for( ; iRows != 0; iRows-- )
    2382   {
    2383     pred = (piCur[0] + piRef[0] + bRound) >> 1;    iTemp = ((piOrg[0]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2384     pred = (piCur[1] + piRef[1] + bRound) >> 1;    iTemp = ((piOrg[1]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2385     pred = (piCur[2] + piRef[2] + bRound) >> 1;    iTemp = ((piOrg[2]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2386     pred = (piCur[3] + piRef[3] + bRound) >> 1;    iTemp = ((piOrg[3]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2387    
    2388     piOrg += iStrideOrg;
    2389     piCur += iStrideCur;
    2390     piRef += iStrideRef;
    2391   }
    2392 
    2393   return ( uiSum );
    2394 }
    2395 
    2396 UInt TComRdCost::xGetSSE8( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    2397 {
    2398   Pel* piOrg   = pcDtParam->pOrg;
    2399   Pel* piCur   = pcDtParam->pCur;
    2400   Pel* piRef   = pRefY;
    2401   Int  iRows   = pcDtParam->iRows;
    2402   Int  iStrideOrg = pcDtParam->iStrideOrg;
    2403   Int  iStrideCur = pcDtParam->iStrideCur;
    2404   Int  iStrideRef =  pcDtParam->iCols;
    2405   Pel  pred;
    2406 
    2407   UInt uiSum = 0;
    2408   Int  iShift = g_uiBitIncrement;
    2409   Int  iOffset = (g_uiBitIncrement>0)? (1<<(g_uiBitIncrement-1)):0;
    2410 
    2411   Int  iTemp;
    2412 
    2413   for( ; iRows != 0; iRows-- )
    2414   {
    2415     pred = (piCur[0] + piRef[0] + bRound) >> 1;    iTemp = ((piOrg[0]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2416     pred = (piCur[1] + piRef[1] + bRound) >> 1;    iTemp = ((piOrg[1]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2417     pred = (piCur[2] + piRef[2] + bRound) >> 1;    iTemp = ((piOrg[2]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2418     pred = (piCur[3] + piRef[3] + bRound) >> 1;    iTemp = ((piOrg[3]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2419     pred = (piCur[4] + piRef[4] + bRound) >> 1;    iTemp = ((piOrg[4]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2420     pred = (piCur[5] + piRef[5] + bRound) >> 1;    iTemp = ((piOrg[5]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2421     pred = (piCur[6] + piRef[6] + bRound) >> 1;    iTemp = ((piOrg[6]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2422     pred = (piCur[7] + piRef[7] + bRound) >> 1;    iTemp = ((piOrg[7]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2423 
    2424     piOrg += iStrideOrg;
    2425     piCur += iStrideCur;
    2426     piRef += iStrideRef;
    2427   }
    2428 
    2429   return ( uiSum );
    2430 }
    2431 
    2432 UInt TComRdCost::xGetSSE16( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    2433 {
    2434   Pel* piOrg   = pcDtParam->pOrg;
    2435   Pel* piCur   = pcDtParam->pCur;
    2436   Pel* piRef   = pRefY;
    2437   Int  iRows   = pcDtParam->iRows;
    2438   Int  iStrideOrg = pcDtParam->iStrideOrg;
    2439   Int  iStrideCur = pcDtParam->iStrideCur;
    2440   Int  iStrideRef =  pcDtParam->iCols;
    2441   Pel  pred;
    2442 
    2443   UInt uiSum = 0;
    2444   Int  iShift = g_uiBitIncrement;
    2445   Int  iOffset = (g_uiBitIncrement>0)? (1<<(g_uiBitIncrement-1)):0;
    2446 
    2447   Int  iTemp;
    2448 
    2449   for( ; iRows != 0; iRows-- )
    2450   {
    2451     pred = (piCur[0] + piRef[0] + bRound) >> 1;    iTemp = ((piOrg[0]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2452     pred = (piCur[1] + piRef[1] + bRound) >> 1;    iTemp = ((piOrg[1]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2453     pred = (piCur[2] + piRef[2] + bRound) >> 1;    iTemp = ((piOrg[2]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2454     pred = (piCur[3] + piRef[3] + bRound) >> 1;    iTemp = ((piOrg[3]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2455     pred = (piCur[4] + piRef[4] + bRound) >> 1;    iTemp = ((piOrg[4]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2456     pred = (piCur[5] + piRef[5] + bRound) >> 1;    iTemp = ((piOrg[5]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2457     pred = (piCur[6] + piRef[6] + bRound) >> 1;    iTemp = ((piOrg[6]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2458     pred = (piCur[7] + piRef[7] + bRound) >> 1;    iTemp = ((piOrg[7]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2459     pred = (piCur[8] + piRef[8] + bRound) >> 1;    iTemp = ((piOrg[8]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2460     pred = (piCur[9] + piRef[9] + bRound) >> 1;    iTemp = ((piOrg[9]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2461     pred = (piCur[10] + piRef[10] + bRound) >> 1;    iTemp = ((piOrg[10]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2462     pred = (piCur[11] + piRef[11] + bRound) >> 1;    iTemp = ((piOrg[11]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2463     pred = (piCur[12] + piRef[12] + bRound) >> 1;    iTemp = ((piOrg[12]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2464     pred = (piCur[13] + piRef[13] + bRound) >> 1;    iTemp = ((piOrg[13]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2465     pred = (piCur[14] + piRef[14] + bRound) >> 1;    iTemp = ((piOrg[14]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2466     pred = (piCur[15] + piRef[15] + bRound) >> 1;    iTemp = ((piOrg[15]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2467 
    2468     piOrg += iStrideOrg;
    2469     piCur += iStrideCur;
    2470     piRef += iStrideRef;
    2471   }
    2472 
    2473   return ( uiSum );
    2474 }
    2475 
    2476 UInt TComRdCost::xGetSSE16N( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    2477 {
    2478   Pel* piOrg   = pcDtParam->pOrg;
    2479   Pel* piCur   = pcDtParam->pCur;
    2480   Pel* piRef   = pRefY;
    2481   Int  iRows   = pcDtParam->iRows;
    2482   Int  iCols   = pcDtParam->iCols;
    2483   Int  iStrideOrg = pcDtParam->iStrideOrg;
    2484   Int  iStrideCur = pcDtParam->iStrideCur;
    2485   Pel  pred;
    2486 
    2487   UInt uiSum = 0;
    2488   Int  iShift = g_uiBitIncrement;
    2489   Int  iOffset = (g_uiBitIncrement>0)? (1<<(g_uiBitIncrement-1)):0;
    2490   Int  iTemp;
    2491 
    2492   for( ; iRows != 0; iRows-- )
    2493   {
    2494     for (Int n = 0; n < iCols; n+=16 )
    2495     {
    2496       pred = (piCur[n+ 0] + piRef[n+ 0] + bRound) >> 1;    iTemp = ((piOrg[n+ 0]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2497       pred = (piCur[n+ 1] + piRef[n+ 1] + bRound) >> 1;    iTemp = ((piOrg[n+ 1]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2498       pred = (piCur[n+ 2] + piRef[n+ 2] + bRound) >> 1;    iTemp = ((piOrg[n+ 2]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2499       pred = (piCur[n+ 3] + piRef[n+ 3] + bRound) >> 1;    iTemp = ((piOrg[n+ 3]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2500       pred = (piCur[n+ 4] + piRef[n+ 4] + bRound) >> 1;    iTemp = ((piOrg[n+ 4]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2501       pred = (piCur[n+ 5] + piRef[n+ 5] + bRound) >> 1;    iTemp = ((piOrg[n+ 5]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2502       pred = (piCur[n+ 6] + piRef[n+ 6] + bRound) >> 1;    iTemp = ((piOrg[n+ 6]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2503       pred = (piCur[n+ 7] + piRef[n+ 7] + bRound) >> 1;    iTemp = ((piOrg[n+ 7]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2504       pred = (piCur[n+ 8] + piRef[n+ 8] + bRound) >> 1;    iTemp = ((piOrg[n+ 8]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2505       pred = (piCur[n+ 9] + piRef[n+ 9] + bRound) >> 1;    iTemp = ((piOrg[n+ 9]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2506       pred = (piCur[n+ 10] + piRef[n+ 10] + bRound) >> 1;    iTemp = ((piOrg[n+ 10]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2507       pred = (piCur[n+ 11] + piRef[n+ 11] + bRound) >> 1;    iTemp = ((piOrg[n+ 11]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2508       pred = (piCur[n+ 12] + piRef[n+ 12] + bRound) >> 1;    iTemp = ((piOrg[n+ 12]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2509       pred = (piCur[n+ 13] + piRef[n+ 13] + bRound) >> 1;    iTemp = ((piOrg[n+ 13]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2510       pred = (piCur[n+ 14] + piRef[n+ 14] + bRound) >> 1;    iTemp = ((piOrg[n+ 14]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2511       pred = (piCur[n+ 15] + piRef[n+ 15] + bRound) >> 1;    iTemp = ((piOrg[n+ 15]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2512     }
    2513     piOrg += iStrideOrg;
    2514     piCur += iStrideCur;
    2515     piRef += iCols;
    2516   }
    2517 
    2518   return ( uiSum );
    2519 }
    2520 
    2521 UInt TComRdCost::xGetSSE32( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    2522 {
    2523   Pel* piOrg   = pcDtParam->pOrg;
    2524   Pel* piCur   = pcDtParam->pCur;
    2525   Pel* piRef   = pRefY;
    2526   Int  iRows   = pcDtParam->iRows;
    2527   Int  iStrideOrg = pcDtParam->iStrideOrg;
    2528   Int  iStrideCur = pcDtParam->iStrideCur;
    2529   Int  iStrideRef =  pcDtParam->iCols;
    2530   Pel  pred;
    2531 
    2532   UInt uiSum = 0;
    2533   Int  iShift = g_uiBitIncrement;
    2534   Int  iOffset = (g_uiBitIncrement>0)? (1<<(g_uiBitIncrement-1)):0;
    2535   Int  iTemp;
    2536 
    2537   for( ; iRows != 0; iRows-- )
    2538   {
    2539     pred = (piCur[0] + piRef[0] + bRound) >> 1;    iTemp = ((piOrg[0]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2540     pred = (piCur[1] + piRef[1] + bRound) >> 1;    iTemp = ((piOrg[1]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2541     pred = (piCur[2] + piRef[2] + bRound) >> 1;    iTemp = ((piOrg[2]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2542     pred = (piCur[3] + piRef[3] + bRound) >> 1;    iTemp = ((piOrg[3]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2543     pred = (piCur[4] + piRef[4] + bRound) >> 1;    iTemp = ((piOrg[4]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2544     pred = (piCur[5] + piRef[5] + bRound) >> 1;    iTemp = ((piOrg[5]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2545     pred = (piCur[6] + piRef[6] + bRound) >> 1;    iTemp = ((piOrg[6]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2546     pred = (piCur[7] + piRef[7] + bRound) >> 1;    iTemp = ((piOrg[7]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2547     pred = (piCur[8] + piRef[8] + bRound) >> 1;    iTemp = ((piOrg[8]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2548     pred = (piCur[9] + piRef[9] + bRound) >> 1;    iTemp = ((piOrg[9]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2549     pred = (piCur[10] + piRef[10] + bRound) >> 1;    iTemp = ((piOrg[10]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2550     pred = (piCur[11] + piRef[11] + bRound) >> 1;    iTemp = ((piOrg[11]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2551     pred = (piCur[12] + piRef[12] + bRound) >> 1;    iTemp = ((piOrg[12]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2552     pred = (piCur[13] + piRef[13] + bRound) >> 1;    iTemp = ((piOrg[13]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2553     pred = (piCur[14] + piRef[14] + bRound) >> 1;    iTemp = ((piOrg[14]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2554     pred = (piCur[15] + piRef[15] + bRound) >> 1;    iTemp = ((piOrg[15]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2555     pred = (piCur[16] + piRef[16] + bRound) >> 1;    iTemp = ((piOrg[16]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2556     pred = (piCur[17] + piRef[17] + bRound) >> 1;    iTemp = ((piOrg[17]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2557     pred = (piCur[18] + piRef[18] + bRound) >> 1;    iTemp = ((piOrg[18]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2558     pred = (piCur[19] + piRef[19] + bRound) >> 1;    iTemp = ((piOrg[19]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2559     pred = (piCur[20] + piRef[20] + bRound) >> 1;    iTemp = ((piOrg[20]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2560     pred = (piCur[21] + piRef[21] + bRound) >> 1;    iTemp = ((piOrg[21]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2561     pred = (piCur[22] + piRef[22] + bRound) >> 1;    iTemp = ((piOrg[22]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2562     pred = (piCur[23] + piRef[23] + bRound) >> 1;    iTemp = ((piOrg[23]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2563     pred = (piCur[24] + piRef[24] + bRound) >> 1;    iTemp = ((piOrg[24]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2564     pred = (piCur[25] + piRef[25] + bRound) >> 1;    iTemp = ((piOrg[25]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2565     pred = (piCur[26] + piRef[26] + bRound) >> 1;    iTemp = ((piOrg[26]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2566     pred = (piCur[27] + piRef[27] + bRound) >> 1;    iTemp = ((piOrg[27]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2567     pred = (piCur[28] + piRef[28] + bRound) >> 1;    iTemp = ((piOrg[28]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2568     pred = (piCur[29] + piRef[29] + bRound) >> 1;    iTemp = ((piOrg[29]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2569     pred = (piCur[30] + piRef[30] + bRound) >> 1;    iTemp = ((piOrg[30]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2570     pred = (piCur[31] + piRef[31] + bRound) >> 1;    iTemp = ((piOrg[31]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2571 
    2572     piOrg += iStrideOrg;
    2573     piCur += iStrideCur;
    2574     piRef += iStrideRef;
    2575   }
    2576 
    2577   return ( uiSum );
    2578 }
    2579 
    2580 UInt TComRdCost::xGetSSE64( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    2581 {
    2582   Pel* piOrg   = pcDtParam->pOrg;
    2583   Pel* piCur   = pcDtParam->pCur;
    2584   Pel* piRef   = pRefY;
    2585   Int  iRows   = pcDtParam->iRows;
    2586   Int  iStrideOrg = pcDtParam->iStrideOrg;
    2587   Int  iStrideCur = pcDtParam->iStrideCur;
    2588   Int  iStrideRef =  pcDtParam->iCols;
    2589   Pel  pred;
    2590 
    2591   UInt uiSum = 0;
    2592   Int  iShift = g_uiBitIncrement;
    2593   Int  iOffset = (g_uiBitIncrement>0)? (1<<(g_uiBitIncrement-1)):0;
    2594   Int  iTemp;
    2595 
    2596   for( ; iRows != 0; iRows-- )
    2597   {
    2598 
    2599     pred = (piCur[0] + piRef[0] + bRound) >> 1;    iTemp = ((piOrg[0]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2600     pred = (piCur[1] + piRef[1] + bRound) >> 1;    iTemp = ((piOrg[1]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2601     pred = (piCur[2] + piRef[2] + bRound) >> 1;    iTemp = ((piOrg[2]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2602     pred = (piCur[3] + piRef[3] + bRound) >> 1;    iTemp = ((piOrg[3]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2603     pred = (piCur[4] + piRef[4] + bRound) >> 1;    iTemp = ((piOrg[4]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2604     pred = (piCur[5] + piRef[5] + bRound) >> 1;    iTemp = ((piOrg[5]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2605     pred = (piCur[6] + piRef[6] + bRound) >> 1;    iTemp = ((piOrg[6]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2606     pred = (piCur[7] + piRef[7] + bRound) >> 1;    iTemp = ((piOrg[7]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2607     pred = (piCur[8] + piRef[8] + bRound) >> 1;    iTemp = ((piOrg[8]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2608     pred = (piCur[9] + piRef[9] + bRound) >> 1;    iTemp = ((piOrg[9]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2609     pred = (piCur[10] + piRef[10] + bRound) >> 1;    iTemp = ((piOrg[10]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2610     pred = (piCur[11] + piRef[11] + bRound) >> 1;    iTemp = ((piOrg[11]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2611     pred = (piCur[12] + piRef[12] + bRound) >> 1;    iTemp = ((piOrg[12]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2612     pred = (piCur[13] + piRef[13] + bRound) >> 1;    iTemp = ((piOrg[13]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2613     pred = (piCur[14] + piRef[14] + bRound) >> 1;    iTemp = ((piOrg[14]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2614     pred = (piCur[15] + piRef[15] + bRound) >> 1;    iTemp = ((piOrg[15]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2615     pred = (piCur[16] + piRef[16] + bRound) >> 1;    iTemp = ((piOrg[16]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2616     pred = (piCur[17] + piRef[17] + bRound) >> 1;    iTemp = ((piOrg[17]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2617     pred = (piCur[18] + piRef[18] + bRound) >> 1;    iTemp = ((piOrg[18]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2618     pred = (piCur[19] + piRef[19] + bRound) >> 1;    iTemp = ((piOrg[19]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2619     pred = (piCur[20] + piRef[20] + bRound) >> 1;    iTemp = ((piOrg[20]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2620     pred = (piCur[21] + piRef[21] + bRound) >> 1;    iTemp = ((piOrg[21]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2621     pred = (piCur[22] + piRef[22] + bRound) >> 1;    iTemp = ((piOrg[22]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2622     pred = (piCur[23] + piRef[23] + bRound) >> 1;    iTemp = ((piOrg[23]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2623     pred = (piCur[24] + piRef[24] + bRound) >> 1;    iTemp = ((piOrg[24]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2624     pred = (piCur[25] + piRef[25] + bRound) >> 1;    iTemp = ((piOrg[25]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2625     pred = (piCur[26] + piRef[26] + bRound) >> 1;    iTemp = ((piOrg[26]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2626     pred = (piCur[27] + piRef[27] + bRound) >> 1;    iTemp = ((piOrg[27]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2627     pred = (piCur[28] + piRef[28] + bRound) >> 1;    iTemp = ((piOrg[28]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2628     pred = (piCur[29] + piRef[29] + bRound) >> 1;    iTemp = ((piOrg[29]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2629 
    2630     pred = (piCur[30] + piRef[30] + bRound) >> 1;    iTemp = ((piOrg[30]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2631     pred = (piCur[31] + piRef[31] + bRound) >> 1;    iTemp = ((piOrg[31]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2632     pred = (piCur[32] + piRef[32] + bRound) >> 1;    iTemp = ((piOrg[32]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2633     pred = (piCur[33] + piRef[33] + bRound) >> 1;    iTemp = ((piOrg[33]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2634     pred = (piCur[34] + piRef[34] + bRound) >> 1;    iTemp = ((piOrg[34]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2635     pred = (piCur[35] + piRef[35] + bRound) >> 1;    iTemp = ((piOrg[35]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2636     pred = (piCur[36] + piRef[36] + bRound) >> 1;    iTemp = ((piOrg[36]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2637     pred = (piCur[37] + piRef[37] + bRound) >> 1;    iTemp = ((piOrg[37]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2638     pred = (piCur[38] + piRef[38] + bRound) >> 1;    iTemp = ((piOrg[38]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2639     pred = (piCur[39] + piRef[39] + bRound) >> 1;    iTemp = ((piOrg[39]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2640 
    2641     pred = (piCur[40] + piRef[40] + bRound) >> 1;    iTemp = ((piOrg[40]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2642     pred = (piCur[41] + piRef[41] + bRound) >> 1;    iTemp = ((piOrg[41]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2643     pred = (piCur[42] + piRef[42] + bRound) >> 1;    iTemp = ((piOrg[42]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2644     pred = (piCur[43] + piRef[43] + bRound) >> 1;    iTemp = ((piOrg[43]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2645     pred = (piCur[44] + piRef[44] + bRound) >> 1;    iTemp = ((piOrg[44]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2646     pred = (piCur[45] + piRef[45] + bRound) >> 1;    iTemp = ((piOrg[45]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2647     pred = (piCur[46] + piRef[46] + bRound) >> 1;    iTemp = ((piOrg[46]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2648     pred = (piCur[47] + piRef[47] + bRound) >> 1;    iTemp = ((piOrg[47]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2649     pred = (piCur[48] + piRef[48] + bRound) >> 1;    iTemp = ((piOrg[48]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2650     pred = (piCur[49] + piRef[49] + bRound) >> 1;    iTemp = ((piOrg[49]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2651 
    2652     pred = (piCur[50] + piRef[50] + bRound) >> 1;    iTemp = ((piOrg[50]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2653     pred = (piCur[51] + piRef[51] + bRound) >> 1;    iTemp = ((piOrg[51]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2654     pred = (piCur[52] + piRef[52] + bRound) >> 1;    iTemp = ((piOrg[52]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2655     pred = (piCur[53] + piRef[53] + bRound) >> 1;    iTemp = ((piOrg[53]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2656     pred = (piCur[54] + piRef[54] + bRound) >> 1;    iTemp = ((piOrg[54]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2657     pred = (piCur[55] + piRef[55] + bRound) >> 1;    iTemp = ((piOrg[55]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2658     pred = (piCur[56] + piRef[56] + bRound) >> 1;    iTemp = ((piOrg[56]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2659     pred = (piCur[57] + piRef[57] + bRound) >> 1;    iTemp = ((piOrg[57]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2660     pred = (piCur[58] + piRef[58] + bRound) >> 1;    iTemp = ((piOrg[58]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2661     pred = (piCur[59] + piRef[59] + bRound) >> 1;    iTemp = ((piOrg[59]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2662 
    2663     pred = (piCur[60] + piRef[60] + bRound) >> 1;    iTemp = ((piOrg[60]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2664     pred = (piCur[61] + piRef[61] + bRound) >> 1;    iTemp = ((piOrg[61]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2665     pred = (piCur[62] + piRef[62] + bRound) >> 1;    iTemp = ((piOrg[62]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2666     pred = (piCur[63] + piRef[63] + bRound) >> 1;    iTemp = ((piOrg[63]+iOffset)>>iShift) - ((pred+iOffset)>>iShift); uiSum += iTemp * iTemp;
    2667 
    2668     piOrg += iStrideOrg;
    2669     piCur += iStrideCur;
    2670     piRef += iStrideRef;
    2671   }
    2672 
    2673   return ( uiSum );
    2674 }
    2675 #endif
    2676 
    2677 
    26781059UInt TComRdCost::xGetSSE( DistParam* pcDtParam )
    26791060{
    2680 #ifdef WEIGHT_PRED
    2681   if ( pcDtParam->applyWeight )
    2682   {
    2683     return xGetSSEw( pcDtParam , pRefY, bRound );
    2684   }
    2685 #endif
    26861061  Pel* piOrg   = pcDtParam->pOrg;
    26871062  Pel* piCur   = pcDtParam->pCur;
     
    27131088UInt TComRdCost::xGetSSE4( DistParam* pcDtParam )
    27141089{
    2715 #ifdef WEIGHT_PRED
    2716   if ( pcDtParam->applyWeight )
    2717   {
    2718     assert( pcDtParam->iCols == 4 );
    2719     return xGetSSEw( pcDtParam , pRefY, bRound );
    2720   }
    2721 #endif
    27221090  Pel* piOrg   = pcDtParam->pOrg;
    27231091  Pel* piCur   = pcDtParam->pCur;
     
    27491117UInt TComRdCost::xGetSSE8( DistParam* pcDtParam )
    27501118{
    2751 #ifdef WEIGHT_PRED
    2752   if ( pcDtParam->applyWeight )
    2753   {
    2754     assert( pcDtParam->iCols == 8 );
    2755     return xGetSSEw( pcDtParam , pRefY, bRound );
    2756   }
    2757 #endif
    27581119  Pel* piOrg   = pcDtParam->pOrg;
    27591120  Pel* piCur   = pcDtParam->pCur;
     
    27881149UInt TComRdCost::xGetSSE16( DistParam* pcDtParam )
    27891150{
    2790 #ifdef WEIGHT_PRED
    2791   if ( pcDtParam->applyWeight )
    2792   {
    2793     assert( pcDtParam->iCols == 16 );
    2794     return xGetSSEw( pcDtParam , pRefY, bRound );
    2795   }
    2796 #endif
    27971151  Pel* piOrg   = pcDtParam->pOrg;
    27981152  Pel* piCur   = pcDtParam->pCur;
     
    28361190UInt TComRdCost::xGetSSE16N( DistParam* pcDtParam )
    28371191{
    2838 #ifdef WEIGHT_PRED
    2839   if ( pcDtParam->applyWeight )
    2840   {
    2841     assert( pcDtParam->iCols == 16 );
    2842     return xGetSSEw( pcDtParam , pRefY, bRound );
    2843   }
    2844 #endif
    28451192  Pel* piOrg   = pcDtParam->pOrg;
    28461193  Pel* piCur   = pcDtParam->pCur;
     
    28871234UInt TComRdCost::xGetSSE32( DistParam* pcDtParam )
    28881235{
    2889 #ifdef WEIGHT_PRED
    2890   if ( pcDtParam->applyWeight )
    2891   {
    2892     assert( pcDtParam->iCols == 32 );
    2893     return xGetSSEw( pcDtParam , pRefY, bRound );
    2894   }
    2895 #endif
    28961236  Pel* piOrg   = pcDtParam->pOrg;
    28971237  Pel* piCur   = pcDtParam->pCur;
     
    29501290UInt TComRdCost::xGetSSE64( DistParam* pcDtParam )
    29511291{
    2952 #ifdef WEIGHT_PRED
    2953   if ( pcDtParam->applyWeight )
    2954   {
    2955     assert( pcDtParam->iCols == 64 );
    2956     return xGetSSEw( pcDtParam , pRefY, bRound );
    2957   }
    2958 #endif
    29591292  Pel* piOrg   = pcDtParam->pOrg;
    29601293  Pel* piCur   = pcDtParam->pCur;
     
    30421375}
    30431376#else
    3044 #ifdef ROUNDING_CONTROL_BIPRED
    3045 UInt TComRdCost::xGetSSE( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    3046 {
    3047   Pel* piOrg   = pcDtParam->pOrg;
    3048   Pel* piCur   = pcDtParam->pCur;
    3049   Pel* piRef   = pRefY;
    3050   Int  iRows   = pcDtParam->iRows;
    3051   Int  iCols   = pcDtParam->iCols;
    3052   Int  iStrideOrg = pcDtParam->iStrideOrg;
    3053   Int  iStrideCur = pcDtParam->iStrideCur;
    3054   Pel  pred;
    3055  
    3056   UInt uiSum = 0;
    3057   UInt uiShift = g_uiBitIncrement<<1;
    3058  
    3059   Int iTemp;
    3060  
    3061   for( ; iRows != 0; iRows-- )
    3062   {
    3063     for (Int n = 0; n < iCols; n++ )
    3064     {
    3065       pred = (piCur[n] + piRef[n] + bRound) >> 1 ;
    3066       iTemp = piOrg[n] - pred;
    3067       uiSum += ( iTemp * iTemp ) >> uiShift;
    3068     }
    3069     piOrg += iStrideOrg;
    3070     piCur += iStrideCur;
    3071     piRef += iCols;
    3072   }
    3073  
    3074   return ( uiSum );
    3075 }
    3076 
    3077 UInt TComRdCost::xGetSSE4( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    3078 {
    3079   Pel* piOrg   = pcDtParam->pOrg;
    3080   Pel* piCur   = pcDtParam->pCur;
    3081   Pel* piRef   = pRefY;
    3082   Int  iRows   = pcDtParam->iRows;
    3083   Int  iStrideOrg = pcDtParam->iStrideOrg;
    3084   Int  iStrideCur = pcDtParam->iStrideCur;
    3085   Int  iStrideRef =  pcDtParam->iCols;
    3086   Pel  pred;
    3087  
    3088   UInt uiSum = 0;
    3089   UInt uiShift = g_uiBitIncrement<<1;
    3090  
    3091   Int  iTemp;
    3092  
    3093   for( ; iRows != 0; iRows-- )
    3094   {
    3095     pred = (piCur[0] + piRef[0] + bRound) >> 1;   iTemp = piOrg[0] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3096     pred = (piCur[1] + piRef[1] + bRound) >> 1;   iTemp = piOrg[1] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3097     pred = (piCur[2] + piRef[2] + bRound) >> 1;   iTemp = piOrg[2] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3098     pred = (piCur[3] + piRef[3] + bRound) >> 1;   iTemp = piOrg[3] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3099    
    3100     piOrg += iStrideOrg;
    3101     piCur += iStrideCur;
    3102     piRef += iStrideRef;
    3103   }
    3104  
    3105   return ( uiSum );
    3106 }
    3107 
    3108 UInt TComRdCost::xGetSSE8( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    3109 {
    3110   Pel* piOrg   = pcDtParam->pOrg;
    3111   Pel* piCur   = pcDtParam->pCur;
    3112   Pel* piRef   = pRefY;
    3113   Int  iRows   = pcDtParam->iRows;
    3114   Int  iStrideOrg = pcDtParam->iStrideOrg;
    3115   Int  iStrideCur = pcDtParam->iStrideCur;
    3116   Int  iStrideRef =  pcDtParam->iCols;
    3117   Pel  pred;
    3118  
    3119   UInt uiSum = 0;
    3120   UInt uiShift = g_uiBitIncrement<<1;
    3121  
    3122   Int  iTemp;
    3123  
    3124   for( ; iRows != 0; iRows-- )
    3125   {
    3126     pred = (piCur[0] + piRef[0] + bRound) >> 1;   iTemp = piOrg[0] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3127     pred = (piCur[1] + piRef[1] + bRound) >> 1;   iTemp = piOrg[1] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3128     pred = (piCur[2] + piRef[2] + bRound) >> 1;   iTemp = piOrg[2] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3129     pred = (piCur[3] + piRef[3] + bRound) >> 1;   iTemp = piOrg[3] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3130     pred = (piCur[4] + piRef[4] + bRound) >> 1;   iTemp = piOrg[4] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3131     pred = (piCur[5] + piRef[5] + bRound) >> 1;   iTemp = piOrg[5] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3132     pred = (piCur[6] + piRef[6] + bRound) >> 1;   iTemp = piOrg[6] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3133     pred = (piCur[7] + piRef[7] + bRound) >> 1;   iTemp = piOrg[7] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3134    
    3135     piOrg += iStrideOrg;
    3136     piCur += iStrideCur;
    3137     piRef += iStrideRef;
    3138   }
    3139  
    3140   return ( uiSum );
    3141 }
    3142 
    // Sum of squared errors for a 16-column block measured against an averaged prediction.
    // For every row (pcDtParam->iRows of them) and each of the 16 columns, the prediction is
    // (pCur + pRefY + bRound) >> 1; the squared difference between pOrg and that prediction,
    // right-shifted by 2 * g_uiBitIncrement, is accumulated into the returned sum.
    //   pcDtParam  supplies pOrg / pCur, their row strides, iRows and iCols
    //   pRefY      second predictor; consecutive rows are pcDtParam->iCols samples apart
    //   bRound     offset added to the predictor sum before it is halved
    // NOTE(review): the 16-wide unrolling presumably expects iCols == 16; this is not checked here.
    3143 UInt TComRdCost::xGetSSE16( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    3144 {
    3145   Pel* piOrg   = pcDtParam->pOrg;
    3146   Pel* piCur   = pcDtParam->pCur;
    3147   Pel* piRef   = pRefY;
    3148   Int  iRows   = pcDtParam->iRows;
    3149   Int  iStrideOrg = pcDtParam->iStrideOrg;
    3150   Int  iStrideCur = pcDtParam->iStrideCur;
           // pRefY has no stride of its own in DistParam: the block width is used as its row pitch
    3151   Int  iStrideRef =  pcDtParam->iCols;
    3152   Pel  pred;
    3153  
    3154   UInt uiSum = 0;
           // each squared error is scaled down by twice the bit increment
    3155   UInt uiShift = g_uiBitIncrement<<1;
    3156  
    3157   Int  iTemp;
    3158  
    3159   for( ; iRows != 0; iRows-- )
    3160   {
             // per column: average the two predictors, then accumulate the scaled squared error
    3161     pred = (piCur[0] + piRef[0] + bRound) >> 1;   iTemp = piOrg[0] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3162     pred = (piCur[1] + piRef[1] + bRound) >> 1;   iTemp = piOrg[1] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3163     pred = (piCur[2] + piRef[2] + bRound) >> 1;   iTemp = piOrg[2] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3164     pred = (piCur[3] + piRef[3] + bRound) >> 1;   iTemp = piOrg[3] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3165     pred = (piCur[4] + piRef[4] + bRound) >> 1;   iTemp = piOrg[4] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3166     pred = (piCur[5] + piRef[5] + bRound) >> 1;   iTemp = piOrg[5] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3167     pred = (piCur[6] + piRef[6] + bRound) >> 1;   iTemp = piOrg[6] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3168     pred = (piCur[7] + piRef[7] + bRound) >> 1;   iTemp = piOrg[7] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3169     pred = (piCur[8] + piRef[8] + bRound) >> 1;   iTemp = piOrg[8] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3170     pred = (piCur[9] + piRef[9] + bRound) >> 1;   iTemp = piOrg[9] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3171     pred = (piCur[10] + piRef[10] + bRound) >> 1;   iTemp = piOrg[10] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3172     pred = (piCur[11] + piRef[11] + bRound) >> 1;   iTemp = piOrg[11] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3173     pred = (piCur[12] + piRef[12] + bRound) >> 1;   iTemp = piOrg[12] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3174     pred = (piCur[13] + piRef[13] + bRound) >> 1;   iTemp = piOrg[13] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3175     pred = (piCur[14] + piRef[14] + bRound) >> 1;   iTemp = piOrg[14] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3176     pred = (piCur[15] + piRef[15] + bRound) >> 1;   iTemp = piOrg[15] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3177    
             // step all three buffers to the next row
    3178     piOrg += iStrideOrg;
    3179     piCur += iStrideCur;
    3180     piRef += iStrideRef;
    3181   }
    3182  
    3183   return ( uiSum );
    3184 }
    3185 
    // Sum of squared errors against an averaged prediction, processing each row in groups of
    // 16 columns. For every sample the prediction is (pCur + pRefY + bRound) >> 1; the squared
    // difference to pOrg, right-shifted by 2 * g_uiBitIncrement, is added to the returned sum.
    //   pcDtParam  supplies pOrg / pCur, their row strides, iRows and iCols
    //   pRefY      second predictor; consecutive rows are iCols samples apart
    //   bRound     offset added to the predictor sum before it is halved
    // NOTE(review): the inner loop always touches columns n..n+15, so an iCols that is not a
    // multiple of 16 would index past column iCols-1; nothing in this function checks that.
    3186 UInt TComRdCost::xGetSSE16N( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    3187 {
    3188   Pel* piOrg   = pcDtParam->pOrg;
    3189   Pel* piCur   = pcDtParam->pCur;
    3190   Pel* piRef   = pRefY;
    3191   Int  iRows   = pcDtParam->iRows;
    3192   Int  iCols   = pcDtParam->iCols;
    3193   Int  iStrideOrg = pcDtParam->iStrideOrg;
    3194   Int  iStrideCur = pcDtParam->iStrideCur;
    3195   Pel  pred;
    3196  
    3197   UInt uiSum = 0;
           // each squared error is scaled down by twice the bit increment
    3198   UInt uiShift = g_uiBitIncrement<<1;
    3199   Int  iTemp;
    3200  
    3201   for( ; iRows != 0; iRows-- )
    3202   {
             // walk the row 16 columns at a time; the body below is the unrolled group
    3203     for (Int n = 0; n < iCols; n+=16 )
    3204     {
    3205       pred = (piCur[n+ 0] + piRef[n+ 0] + bRound) >> 1;  iTemp = piOrg[n+ 0] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3206       pred = (piCur[n+ 1] + piRef[n+ 1] + bRound) >> 1;  iTemp = piOrg[n+ 1] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3207       pred = (piCur[n+ 2] + piRef[n+ 2] + bRound) >> 1;  iTemp = piOrg[n+ 2] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3208       pred = (piCur[n+ 3] + piRef[n+ 3] + bRound) >> 1;  iTemp = piOrg[n+ 3] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3209       pred = (piCur[n+ 4] + piRef[n+ 4] + bRound) >> 1;  iTemp = piOrg[n+ 4] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3210       pred = (piCur[n+ 5] + piRef[n+ 5] + bRound) >> 1;  iTemp = piOrg[n+ 5] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3211       pred = (piCur[n+ 6] + piRef[n+ 6] + bRound) >> 1;  iTemp = piOrg[n+ 6] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3212       pred = (piCur[n+ 7] + piRef[n+ 7] + bRound) >> 1;  iTemp = piOrg[n+ 7] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3213       pred = (piCur[n+ 8] + piRef[n+ 8] + bRound) >> 1;  iTemp = piOrg[n+ 8] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3214       pred = (piCur[n+ 9] + piRef[n+ 9] + bRound) >> 1;  iTemp = piOrg[n+ 9] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3215       pred = (piCur[n+ 10] + piRef[n+ 10] + bRound) >> 1;  iTemp = piOrg[n+ 10] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3216       pred = (piCur[n+ 11] + piRef[n+ 11] + bRound) >> 1;  iTemp = piOrg[n+ 11] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3217       pred = (piCur[n+ 12] + piRef[n+ 12] + bRound) >> 1;  iTemp = piOrg[n+ 12] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3218       pred = (piCur[n+ 13] + piRef[n+ 13] + bRound) >> 1;  iTemp = piOrg[n+ 13] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3219       pred = (piCur[n+ 14] + piRef[n+ 14] + bRound) >> 1;  iTemp = piOrg[n+ 14] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3220       pred = (piCur[n+ 15] + piRef[n+ 15] + bRound) >> 1;  iTemp = piOrg[n+ 15] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3221     }
             // next row; the pRefY buffer uses the block width as its row pitch
    3222     piOrg += iStrideOrg;
    3223     piCur += iStrideCur;
    3224     piRef += iCols;
    3225   }
    3226  
    3227   return ( uiSum );
    3228 }
    3229 
    // Sum of squared errors for a 32-column block measured against an averaged prediction.
    // For every row (pcDtParam->iRows of them) and each of the 32 columns, the prediction is
    // (pCur + pRefY + bRound) >> 1; the squared difference between pOrg and that prediction,
    // right-shifted by 2 * g_uiBitIncrement, is accumulated into the returned sum.
    //   pcDtParam  supplies pOrg / pCur, their row strides, iRows and iCols
    //   pRefY      second predictor; consecutive rows are pcDtParam->iCols samples apart
    //   bRound     offset added to the predictor sum before it is halved
    // NOTE(review): the 32-wide unrolling presumably expects iCols == 32; this is not checked here.
    3230 UInt TComRdCost::xGetSSE32( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    3231 {
    3232   Pel* piOrg   = pcDtParam->pOrg;
    3233   Pel* piCur   = pcDtParam->pCur;
    3234   Pel* piRef   = pRefY;
    3235   Int  iRows   = pcDtParam->iRows;
    3236   Int  iStrideOrg = pcDtParam->iStrideOrg;
    3237   Int  iStrideCur = pcDtParam->iStrideCur;
           // pRefY has no stride of its own in DistParam: the block width is used as its row pitch
    3238   Int  iStrideRef =  pcDtParam->iCols;
    3239   Pel  pred;
    3240  
    3241   UInt uiSum = 0;
           // each squared error is scaled down by twice the bit increment
    3242   UInt uiShift = g_uiBitIncrement<<1;
    3243   Int  iTemp;
    3244  
    3245   for( ; iRows != 0; iRows-- )
    3246   {
             // per column: average the two predictors, then accumulate the scaled squared error
    3247     pred = (piCur[0] + piRef[0] + bRound) >> 1;   iTemp = piOrg[0] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3248     pred = (piCur[1] + piRef[1] + bRound) >> 1;   iTemp = piOrg[1] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3249     pred = (piCur[2] + piRef[2] + bRound) >> 1;   iTemp = piOrg[2] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3250     pred = (piCur[3] + piRef[3] + bRound) >> 1;   iTemp = piOrg[3] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3251     pred = (piCur[4] + piRef[4] + bRound) >> 1;   iTemp = piOrg[4] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3252     pred = (piCur[5] + piRef[5] + bRound) >> 1;   iTemp = piOrg[5] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3253     pred = (piCur[6] + piRef[6] + bRound) >> 1;   iTemp = piOrg[6] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3254     pred = (piCur[7] + piRef[7] + bRound) >> 1;   iTemp = piOrg[7] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3255     pred = (piCur[8] + piRef[8] + bRound) >> 1;   iTemp = piOrg[8] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3256     pred = (piCur[9] + piRef[9] + bRound) >> 1;   iTemp = piOrg[9] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3257     pred = (piCur[10] + piRef[10] + bRound) >> 1;   iTemp = piOrg[10] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3258     pred = (piCur[11] + piRef[11] + bRound) >> 1;   iTemp = piOrg[11] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3259     pred = (piCur[12] + piRef[12] + bRound) >> 1;   iTemp = piOrg[12] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3260     pred = (piCur[13] + piRef[13] + bRound) >> 1;   iTemp = piOrg[13] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3261     pred = (piCur[14] + piRef[14] + bRound) >> 1;   iTemp = piOrg[14] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3262     pred = (piCur[15] + piRef[15] + bRound) >> 1;   iTemp = piOrg[15] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3263     pred = (piCur[16] + piRef[16] + bRound) >> 1;   iTemp = piOrg[16] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3264     pred = (piCur[17] + piRef[17] + bRound) >> 1;   iTemp = piOrg[17] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3265     pred = (piCur[18] + piRef[18] + bRound) >> 1;   iTemp = piOrg[18] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3266     pred = (piCur[19] + piRef[19] + bRound) >> 1;   iTemp = piOrg[19] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3267     pred = (piCur[20] + piRef[20] + bRound) >> 1;   iTemp = piOrg[20] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3268     pred = (piCur[21] + piRef[21] + bRound) >> 1;   iTemp = piOrg[21] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3269     pred = (piCur[22] + piRef[22] + bRound) >> 1;   iTemp = piOrg[22] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3270     pred = (piCur[23] + piRef[23] + bRound) >> 1;   iTemp = piOrg[23] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3271     pred = (piCur[24] + piRef[24] + bRound) >> 1;   iTemp = piOrg[24] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3272     pred = (piCur[25] + piRef[25] + bRound) >> 1;   iTemp = piOrg[25] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3273     pred = (piCur[26] + piRef[26] + bRound) >> 1;   iTemp = piOrg[26] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3274     pred = (piCur[27] + piRef[27] + bRound) >> 1;   iTemp = piOrg[27] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3275     pred = (piCur[28] + piRef[28] + bRound) >> 1;   iTemp = piOrg[28] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3276     pred = (piCur[29] + piRef[29] + bRound) >> 1;   iTemp = piOrg[29] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3277     pred = (piCur[30] + piRef[30] + bRound) >> 1;   iTemp = piOrg[30] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3278     pred = (piCur[31] + piRef[31] + bRound) >> 1;   iTemp = piOrg[31] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3279    
             // step all three buffers to the next row
    3280     piOrg += iStrideOrg;
    3281     piCur += iStrideCur;
    3282     piRef += iStrideRef;
    3283   }
    3284  
    3285   return ( uiSum );
    3286 }
    3287 
    // Sum of squared errors for a 64-column block measured against an averaged prediction.
    // For every row (pcDtParam->iRows of them) and each of the 64 columns, the prediction is
    // (pCur + pRefY + bRound) >> 1; the squared difference between pOrg and that prediction,
    // right-shifted by 2 * g_uiBitIncrement, is accumulated into the returned sum.
    //   pcDtParam  supplies pOrg / pCur, their row strides, iRows and iCols
    //   pRefY      second predictor; consecutive rows are pcDtParam->iCols samples apart
    //   bRound     offset added to the predictor sum before it is halved
    // NOTE(review): the 64-wide unrolling presumably expects iCols == 64; this is not checked here.
    3288 UInt TComRdCost::xGetSSE64( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    3289 {
    3290   Pel* piOrg   = pcDtParam->pOrg;
    3291   Pel* piCur   = pcDtParam->pCur;
    3292   Pel* piRef   = pRefY;
    3293   Int  iRows   = pcDtParam->iRows;
    3294   Int  iStrideOrg = pcDtParam->iStrideOrg;
    3295   Int  iStrideCur = pcDtParam->iStrideCur;
           // pRefY has no stride of its own in DistParam: the block width is used as its row pitch
    3296   Int  iStrideRef =  pcDtParam->iCols;
    3297   Pel  pred;
    3298  
    3299   UInt uiSum = 0;
           // each squared error is scaled down by twice the bit increment
    3300   UInt uiShift = g_uiBitIncrement<<1;
    3301   Int  iTemp;
    3302  
    3303   for( ; iRows != 0; iRows-- )
    3304   {
    3305    
             // per column: average the two predictors, then accumulate the scaled squared error
    3306     pred = (piCur[0] + piRef[0] + bRound) >> 1;   iTemp = piOrg[0] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3307     pred = (piCur[1] + piRef[1] + bRound) >> 1;   iTemp = piOrg[1] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3308     pred = (piCur[2] + piRef[2] + bRound) >> 1;   iTemp = piOrg[2] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3309     pred = (piCur[3] + piRef[3] + bRound) >> 1;   iTemp = piOrg[3] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3310     pred = (piCur[4] + piRef[4] + bRound) >> 1;   iTemp = piOrg[4] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3311     pred = (piCur[5] + piRef[5] + bRound) >> 1;   iTemp = piOrg[5] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3312     pred = (piCur[6] + piRef[6] + bRound) >> 1;   iTemp = piOrg[6] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3313     pred = (piCur[7] + piRef[7] + bRound) >> 1;   iTemp = piOrg[7] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3314     pred = (piCur[8] + piRef[8] + bRound) >> 1;   iTemp = piOrg[8] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3315     pred = (piCur[9] + piRef[9] + bRound) >> 1;   iTemp = piOrg[9] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3316     pred = (piCur[10] + piRef[10] + bRound) >> 1;   iTemp = piOrg[10] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3317     pred = (piCur[11] + piRef[11] + bRound) >> 1;   iTemp = piOrg[11] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3318     pred = (piCur[12] + piRef[12] + bRound) >> 1;   iTemp = piOrg[12] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3319     pred = (piCur[13] + piRef[13] + bRound) >> 1;   iTemp = piOrg[13] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3320     pred = (piCur[14] + piRef[14] + bRound) >> 1;   iTemp = piOrg[14] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3321     pred = (piCur[15] + piRef[15] + bRound) >> 1;   iTemp = piOrg[15] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3322     pred = (piCur[16] + piRef[16] + bRound) >> 1;   iTemp = piOrg[16] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3323     pred = (piCur[17] + piRef[17] + bRound) >> 1;   iTemp = piOrg[17] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3324     pred = (piCur[18] + piRef[18] + bRound) >> 1;   iTemp = piOrg[18] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3325     pred = (piCur[19] + piRef[19] + bRound) >> 1;   iTemp = piOrg[19] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3326     pred = (piCur[20] + piRef[20] + bRound) >> 1;   iTemp = piOrg[20] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3327     pred = (piCur[21] + piRef[21] + bRound) >> 1;   iTemp = piOrg[21] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3328     pred = (piCur[22] + piRef[22] + bRound) >> 1;   iTemp = piOrg[22] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3329     pred = (piCur[23] + piRef[23] + bRound) >> 1;   iTemp = piOrg[23] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3330     pred = (piCur[24] + piRef[24] + bRound) >> 1;   iTemp = piOrg[24] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3331     pred = (piCur[25] + piRef[25] + bRound) >> 1;   iTemp = piOrg[25] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3332     pred = (piCur[26] + piRef[26] + bRound) >> 1;   iTemp = piOrg[26] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3333     pred = (piCur[27] + piRef[27] + bRound) >> 1;   iTemp = piOrg[27] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3334     pred = (piCur[28] + piRef[28] + bRound) >> 1;   iTemp = piOrg[28] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3335     pred = (piCur[29] + piRef[29] + bRound) >> 1;   iTemp = piOrg[29] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3336    
    3337     pred = (piCur[30] + piRef[30] + bRound) >> 1;   iTemp = piOrg[30] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3338     pred = (piCur[31] + piRef[31] + bRound) >> 1;   iTemp = piOrg[31] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3339     pred = (piCur[32] + piRef[32] + bRound) >> 1;   iTemp = piOrg[32] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3340     pred = (piCur[33] + piRef[33] + bRound) >> 1;   iTemp = piOrg[33] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3341     pred = (piCur[34] + piRef[34] + bRound) >> 1;   iTemp = piOrg[34] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3342     pred = (piCur[35] + piRef[35] + bRound) >> 1;   iTemp = piOrg[35] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3343     pred = (piCur[36] + piRef[36] + bRound) >> 1;   iTemp = piOrg[36] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3344     pred = (piCur[37] + piRef[37] + bRound) >> 1;   iTemp = piOrg[37] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3345     pred = (piCur[38] + piRef[38] + bRound) >> 1;   iTemp = piOrg[38] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3346     pred = (piCur[39] + piRef[39] + bRound) >> 1;   iTemp = piOrg[39] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3347    
    3348     pred = (piCur[40] + piRef[40] + bRound) >> 1;   iTemp = piOrg[40] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3349     pred = (piCur[41] + piRef[41] + bRound) >> 1;   iTemp = piOrg[41] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3350     pred = (piCur[42] + piRef[42] + bRound) >> 1;   iTemp = piOrg[42] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3351     pred = (piCur[43] + piRef[43] + bRound) >> 1;   iTemp = piOrg[43] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3352     pred = (piCur[44] + piRef[44] + bRound) >> 1;   iTemp = piOrg[44] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3353     pred = (piCur[45] + piRef[45] + bRound) >> 1;   iTemp = piOrg[45] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3354     pred = (piCur[46] + piRef[46] + bRound) >> 1;   iTemp = piOrg[46] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3355     pred = (piCur[47] + piRef[47] + bRound) >> 1;   iTemp = piOrg[47] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3356     pred = (piCur[48] + piRef[48] + bRound) >> 1;   iTemp = piOrg[48] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3357     pred = (piCur[49] + piRef[49] + bRound) >> 1;   iTemp = piOrg[49] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3358    
    3359     pred = (piCur[50] + piRef[50] + bRound) >> 1;   iTemp = piOrg[50] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3360     pred = (piCur[51] + piRef[51] + bRound) >> 1;   iTemp = piOrg[51] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3361     pred = (piCur[52] + piRef[52] + bRound) >> 1;   iTemp = piOrg[52] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3362     pred = (piCur[53] + piRef[53] + bRound) >> 1;   iTemp = piOrg[53] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3363     pred = (piCur[54] + piRef[54] + bRound) >> 1;   iTemp = piOrg[54] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3364     pred = (piCur[55] + piRef[55] + bRound) >> 1;   iTemp = piOrg[55] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3365     pred = (piCur[56] + piRef[56] + bRound) >> 1;   iTemp = piOrg[56] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3366     pred = (piCur[57] + piRef[57] + bRound) >> 1;   iTemp = piOrg[57] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3367     pred = (piCur[58] + piRef[58] + bRound) >> 1;   iTemp = piOrg[58] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3368     pred = (piCur[59] + piRef[59] + bRound) >> 1;   iTemp = piOrg[59] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3369    
    3370     pred = (piCur[60] + piRef[60] + bRound) >> 1;   iTemp = piOrg[60] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3371     pred = (piCur[61] + piRef[61] + bRound) >> 1;   iTemp = piOrg[61] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3372     pred = (piCur[62] + piRef[62] + bRound) >> 1;   iTemp = piOrg[62] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3373     pred = (piCur[63] + piRef[63] + bRound) >> 1;   iTemp = piOrg[63] - pred; uiSum += ( iTemp * iTemp ) >> uiShift;
    3374    
             // step all three buffers to the next row
    3375     piOrg += iStrideOrg;
    3376     piCur += iStrideCur;
    3377     piRef += iStrideRef;
    3378   }
    3379  
    3380   return ( uiSum );
    3381 }
    3382 #endif
    3383 
    33841377UInt TComRdCost::xGetSSE( DistParam* pcDtParam )
    33851378{
    3386 #ifdef WEIGHT_PRED
    3387   if ( pcDtParam->applyWeight )
     1379  if ( pcDtParam->bApplyWeight )
    33881380  {
    33891381    return xGetSSEw( pcDtParam );
    33901382  }
    3391 #endif
    33921383  Pel* piOrg   = pcDtParam->pOrg;
    33931384  Pel* piCur   = pcDtParam->pCur;
     
    34121403    {
    34131404        if( piUsed[n] )
    3414         {
    3415           iTemp = piOrg[n  ] - piCur[n  ];
    3416           uiSum += ( iTemp * iTemp ) >> uiShift;
    3417         }
     1405    {
     1406      iTemp = piOrg[n  ] - piCur[n  ];
     1407      uiSum += ( iTemp * iTemp ) >> uiShift;
     1408    }
    34181409      }
    34191410      piOrg  += iStrideOrg;
     
    34351426    piCur += iStrideCur;
    34361427  }
    3437 #if HHI_INTERVIEW_SKIP
     1428  #if HHI_INTERVIEW_SKIP
    34381429  }
    34391430#endif
     
    34441435UInt TComRdCost::xGetSSE4( DistParam* pcDtParam )
    34451436{
    3446 #ifdef WEIGHT_PRED
    3447   if ( pcDtParam->applyWeight )
     1437  if ( pcDtParam->bApplyWeight )
    34481438  {
    34491439    assert( pcDtParam->iCols == 4 );
    34501440    return xGetSSEw( pcDtParam );
    34511441  }
    3452 #endif
    34531442  Pel* piOrg   = pcDtParam->pOrg;
    34541443  Pel* piCur   = pcDtParam->pCur;
     
    35021491UInt TComRdCost::xGetSSE8( DistParam* pcDtParam )
    35031492{
    3504 #ifdef WEIGHT_PRED
    3505   if ( pcDtParam->applyWeight )
     1493  if ( pcDtParam->bApplyWeight )
    35061494  {
    35071495    assert( pcDtParam->iCols == 8 );
    35081496    return xGetSSEw( pcDtParam );
    35091497  }
    3510 #endif
    35111498  Pel* piOrg   = pcDtParam->pOrg;
    35121499  Pel* piCur   = pcDtParam->pCur;
     
    35671554UInt TComRdCost::xGetSSE16( DistParam* pcDtParam )
    35681555{
    3569 #ifdef WEIGHT_PRED
    3570   if ( pcDtParam->applyWeight )
     1556  if ( pcDtParam->bApplyWeight )
    35711557  {
    35721558    assert( pcDtParam->iCols == 16 );
    35731559    return xGetSSEw( pcDtParam );
    35741560  }
    3575 #endif
    35761561  Pel* piOrg   = pcDtParam->pOrg;
    35771562  Pel* piCur   = pcDtParam->pCur;
     
    36491634UInt TComRdCost::xGetSSE16N( DistParam* pcDtParam )
    36501635{
    3651 #ifdef WEIGHT_PRED
    3652   if ( pcDtParam->applyWeight )
     1636  if ( pcDtParam->bApplyWeight )
    36531637  {
    36541638    return xGetSSEw( pcDtParam );
    36551639  }
    3656 #endif
    36571640  Pel* piOrg   = pcDtParam->pOrg;
    36581641  Pel* piCur   = pcDtParam->pCur;
     
    37351718UInt TComRdCost::xGetSSE32( DistParam* pcDtParam )
    37361719{
    3737 #ifdef WEIGHT_PRED
    3738   if ( pcDtParam->applyWeight )
     1720  if ( pcDtParam->bApplyWeight )
    37391721  {
    37401722    assert( pcDtParam->iCols == 32 );
    37411723    return xGetSSEw( pcDtParam );
    37421724  }
    3743 #endif
    37441725  Pel* piOrg   = pcDtParam->pOrg;
    37451726  Pel* piCur   = pcDtParam->pCur;
     
    38481829UInt TComRdCost::xGetSSE64( DistParam* pcDtParam )
    38491830{
    3850 #ifdef WEIGHT_PRED
    3851   if ( pcDtParam->applyWeight )
     1831  if ( pcDtParam->bApplyWeight )
    38521832  {
    38531833    assert( pcDtParam->iCols == 64 );
    38541834    return xGetSSEw( pcDtParam );
    38551835  }
    3856 #endif
    38571836  Pel* piOrg   = pcDtParam->pOrg;
    38581837  Pel* piCur   = pcDtParam->pCur;
     
    40272006// --------------------------------------------------------------------------------------------------------------------
    40282007
    4029 #ifdef ROUNDING_CONTROL_BIPRED
    4030 
    4031 UInt TComRdCost::xCalcHADs2x2( Pel *piOrg, Pel *piCur, Int iStrideOrg, Int iStrideCur, Int iStep, Pel* pRefY, Int refYStride, Bool bRound )
     2008UInt TComRdCost::xCalcHADs2x2( Pel *piOrg, Pel *piCur, Int iStrideOrg, Int iStrideCur, Int iStep )
    40322009{
    40332010  Int satd = 0, diff[4], m[4];
    4034   Pel pred;
    4035  
    4036   pred = ( (piCur[0] + pRefY[0] + bRound) >> 1);
    4037   diff[0] = (piOrg[0             ] - pred) << 1;
    4038   pred = ( (piCur[iStep] + pRefY[1] + bRound) >> 1);
    4039   diff[1] = (piOrg[1             ] - pred) << 1;
    4040   pred = ( (piCur[iStrideCur] + pRefY[refYStride] + bRound) >> 1);
    4041   diff[2] = (piOrg[iStrideOrg    ] - pred) << 1;
    4042   pred = ( (piCur[iStep + iStrideCur] + pRefY[refYStride + 1] + bRound) >> 1);
    4043   diff[3] = (piOrg[iStrideOrg + 1] - pred) << 1;
    4044  
     2011  assert( iStep == 1 );
     2012  diff[0] = piOrg[0             ] - piCur[0];
     2013  diff[1] = piOrg[1             ] - piCur[1];
     2014  diff[2] = piOrg[iStrideOrg    ] - piCur[0 + iStrideCur];
     2015  diff[3] = piOrg[iStrideOrg + 1] - piCur[1 + iStrideCur];
    40452016  m[0] = diff[0] + diff[2];
    40462017  m[1] = diff[1] + diff[3];
     
    40562027}
    40572028
    4058 UInt TComRdCost::xCalcHADs4x4( Pel *piOrg, Pel *piCur, Int iStrideOrg, Int iStrideCur, Int iStep, Pel* pRefY, Int refYStride, Bool bRound )
     2029UInt TComRdCost::xCalcHADs4x4( Pel *piOrg, Pel *piCur, Int iStrideOrg, Int iStrideCur, Int iStep )
    40592030{
    40602031  Int k, satd = 0, diff[16], m[16], d[16];
    4061   Pel pred;
    4062   Pel* piRef = pRefY;
    4063  
     2032 
     2033  assert( iStep == 1 );
    40642034  for( k = 0; k < 16; k+=4 )
    40652035  {
    4066     pred = ( (piCur[0*iStep] + piRef[0] + bRound) >> 1);
    4067     diff[k+0] = (piOrg[0] - pred) << 1;
    4068     pred = ( (piCur[1*iStep] + piRef[1] + bRound) >> 1);
    4069     diff[k+1] = (piOrg[1] - pred) << 1;
    4070     pred = ( (piCur[2*iStep] + piRef[2] + bRound) >> 1);
    4071     diff[k+2] = (piOrg[2] - pred) << 1;
    4072     pred = ( (piCur[3*iStep] + piRef[3] + bRound) >> 1);
    4073     diff[k+3] = (piOrg[3] - pred) << 1;
    4074 
     2036    diff[k+0] = piOrg[0] - piCur[0];
     2037    diff[k+1] = piOrg[1] - piCur[1];
     2038    diff[k+2] = piOrg[2] - piCur[2];
     2039    diff[k+3] = piOrg[3] - piCur[3];
     2040   
    40752041    piCur += iStrideCur;
    40762042    piOrg += iStrideOrg;
    4077     piRef += refYStride;
    40782043  }
    40792044 
     
    41562121}
    41572122
    4158 UInt TComRdCost::xCalcHADs8x8( Pel *piOrg, Pel *piCur, Int iStrideOrg, Int iStrideCur, Int iStep, Pel* pRefY, Int refYStride, Bool bRound )
     2123UInt TComRdCost::xCalcHADs8x8( Pel *piOrg, Pel *piCur, Int iStrideOrg, Int iStrideCur, Int iStep )
    41592124{
    41602125  Int k, i, j, jj, sad=0;
    41612126  Int diff[64], m1[8][8], m2[8][8], m3[8][8];
    4162   Pel pred;
    4163   Pel* piRef = pRefY;
    4164   Int iStep2 = iStep<<1;
    4165   Int iStep3 = iStep2 + iStep;
    4166   Int iStep4 = iStep3 + iStep;
    4167   Int iStep5 = iStep4 + iStep;
    4168   Int iStep6 = iStep5 + iStep;
    4169   Int iStep7 = iStep6 + iStep;
    4170  
    4171   for( k = 0; k < 64; k+=8 )
    4172   {
    4173     pred = ( (piCur[0     ] + piRef[0] + bRound) >> 1 );  diff[k  ] = (piOrg[0] - pred) << 1;
    4174     pred = ( (piCur[iStep ] + piRef[1] + bRound) >> 1 );  diff[k+1] = (piOrg[1] - pred) << 1;
    4175     pred = ( (piCur[iStep2] + piRef[2] + bRound) >> 1 );  diff[k+2] = (piOrg[2] - pred) << 1;
    4176     pred = ( (piCur[iStep3] + piRef[3] + bRound) >> 1 );  diff[k+3] = (piOrg[3] - pred) << 1;
    4177     pred = ( (piCur[iStep4] + piRef[4] + bRound) >> 1 );  diff[k+4] = (piOrg[4] - pred) << 1;
    4178     pred = ( (piCur[iStep5] + piRef[5] + bRound) >> 1 );  diff[k+5] = (piOrg[5] - pred) << 1;
    4179     pred = ( (piCur[iStep6] + piRef[6] + bRound) >> 1 );  diff[k+6] = (piOrg[6] - pred) << 1;
    4180     pred = ( (piCur[iStep7] + piRef[7] + bRound) >> 1 );  diff[k+7] = (piOrg[7] - pred) << 1;
     2127  assert( iStep == 1 );
     2128  for( k = 0; k < 64; k += 8 )
     2129  {
     2130    diff[k+0] = piOrg[0] - piCur[0];
     2131    diff[k+1] = piOrg[1] - piCur[1];
     2132    diff[k+2] = piOrg[2] - piCur[2];
     2133    diff[k+3] = piOrg[3] - piCur[3];
     2134    diff[k+4] = piOrg[4] - piCur[4];
     2135    diff[k+5] = piOrg[5] - piCur[5];
     2136    diff[k+6] = piOrg[6] - piCur[6];
     2137    diff[k+7] = piOrg[7] - piCur[7];
     2138   
    41812139    piCur += iStrideCur;
    41822140    piOrg += iStrideOrg;
    4183     piRef += refYStride;
    4184   }
     2141  }
     2142 
    41852143  //horizontal
    41862144  for (j=0; j < 8; j++)
     
    42452203    m2[7][i] = m1[6][i] - m1[7][i];
    42462204  }
    4247   for (j=0; j < 8; j++)
    4248     for (i=0; i < 8; i++)
    4249       sad += (abs(m2[j][i]));
     2205 
     2206  for (i = 0; i < 8; i++)
     2207  {
     2208    for (j = 0; j < 8; j++)
     2209    {
     2210      sad += abs(m2[i][j]);
     2211    }
     2212  }
    42502213 
    42512214  sad=((sad+2)>>2);
     
    42542217}
    42552218
    // Accumulates xCalcHADs4x4() results down a 4-column block, one 4x4 tile per four rows,
    // using the variant that takes a second predictor (pRefY) and a rounding flag.
    // Returns the accumulated value right-shifted by g_uiBitIncrement.
    //   pcDtParam  supplies pOrg / pCur, their row strides, iStep, iRows and iCols
    //   pRefY      second predictor, handed to xCalcHADs4x4 with a row pitch of iCols
    //   bRound     forwarded unchanged to xCalcHADs4x4
    // Only one tile per row band is evaluated (there is no horizontal loop).
    4256 UInt TComRdCost::xGetHADs4( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    4257 {
    4258   Pel* piOrg   = pcDtParam->pOrg;
    4259   Pel* piCur   = pcDtParam->pCur;
    4260   Pel* piRef   = pRefY;
    4261   Int  iRows   = pcDtParam->iRows;
    4262   Int  iStrideCur = pcDtParam->iStrideCur;
    4263   Int  iStrideOrg = pcDtParam->iStrideOrg;
    4264   Int  iStep  = pcDtParam->iStep;
    4265   Int  y;
           // pointer advance for one band of four rows
    4266   Int  iOffsetOrg = iStrideOrg<<2;
    4267   Int  iOffsetCur = iStrideCur<<2;
    4268  
    4269   UInt uiSum = 0;
    4270  
    4271   for ( y=0; y<iRows; y+= 4 )
    4272   {
    4273     uiSum += xCalcHADs4x4( piOrg, piCur, iStrideOrg, iStrideCur, iStep, piRef, pcDtParam->iCols, bRound );
    4274     piOrg += iOffsetOrg;
    4275     piCur += iOffsetCur;
             // four rows further in the pRefY buffer, whose row pitch is iCols
    4276     piRef += (pcDtParam->iCols << 2);
    4277   }
    4278  
    4279   return ( uiSum >> g_uiBitIncrement );
    4280 }
    4281 
    // Accumulates Hadamard costs for an 8-column block using the variants that take a second
    // predictor (pRefY) and a rounding flag. When iRows == 4 the block is covered by two
    // side-by-side xCalcHADs4x4() tiles; otherwise one xCalcHADs8x8() tile is evaluated per
    // eight rows. Returns the accumulated value right-shifted by g_uiBitIncrement.
    //   pcDtParam  supplies pOrg / pCur, their row strides, iStep, iRows and iCols
    //   pRefY      second predictor, handed to the tile functions with a row pitch of iCols
    //   bRound     forwarded unchanged to the tile functions
    4282 UInt TComRdCost::xGetHADs8( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    4283 {
    4284   Pel* piOrg   = pcDtParam->pOrg;
    4285   Pel* piCur   = pcDtParam->pCur;
    4286   Pel* piRef   = pRefY;
    4287   Int  iRows   = pcDtParam->iRows;
    4288   Int  iStrideCur = pcDtParam->iStrideCur;
    4289   Int  iStrideOrg = pcDtParam->iStrideOrg;
    4290   Int  iStep  = pcDtParam->iStep;
    4291   Int  y;
    4292  
    4293   UInt uiSum = 0;
    4294  
    4295   if ( iRows == 4 ) // 8x4 case
    4296   {
             // left and right 4x4 halves; pCur is offset in units of iStep, pOrg and pRefY by plain samples
    4297     uiSum += xCalcHADs4x4( piOrg+0, piCur        , iStrideOrg, iStrideCur, iStep, piRef, pcDtParam->iCols, bRound );
    4298     uiSum += xCalcHADs4x4( piOrg+4, piCur+4*iStep, iStrideOrg, iStrideCur, iStep, piRef+4, pcDtParam->iCols, bRound );
    4299   }
    4300   else
    4301   {
             // pointer advance for one band of eight rows
    4302     Int  iOffsetOrg = iStrideOrg<<3;
    4303     Int  iOffsetCur = iStrideCur<<3;
    4304     for ( y=0; y<iRows; y+= 8 )
    4305     {
    4306       uiSum += xCalcHADs8x8( piOrg, piCur, iStrideOrg, iStrideCur, iStep, piRef, pcDtParam->iCols, bRound );
    4307       piOrg += iOffsetOrg;
    4308       piCur += iOffsetCur;
               // eight rows further in the pRefY buffer, whose row pitch is iCols
    4309       piRef += (pcDtParam->iCols << 3);
    4310     }
    4311   }
    4312  
    4313   return ( uiSum >> g_uiBitIncrement );
    4314 }
    4315 
    4316 UInt TComRdCost::xGetHADs( DistParam* pcDtParam, Pel* pRefY, Bool bRound )
    4317 {
    4318   Pel* piOrg   = pcDtParam->pOrg;
    4319   Pel* piCur   = pcDtParam->pCur;
    4320   Pel* piRef   = pRefY;
    4321   Int  iRows   = pcDtParam->iRows;
    4322   Int  iCols   = pcDtParam->iCols;
    4323   Int  iStrideCur = pcDtParam->iStrideCur;
    4324   Int  iStrideOrg = pcDtParam->iStrideOrg;
    4325   Int  iStep  = pcDtParam->iStep;
    4326  
    4327   Int  x, y;
    4328  
    4329   UInt uiSum = 0;
    4330  
    4331   if( ( iRows % 8 == 0) && (iCols % 8 == 0) )
    4332   {
    4333     Int  iOffsetOrg = iStrideOrg<<3;
    4334     Int  iOffsetCur = iStrideCur<<3;
    4335     for ( y=0; y<iRows; y+= 8 )
    4336     {
    4337       for ( x=0; x<iCols; x+= 8 )  // do HAD over 8xiCols pixels
    4338       {
    4339         uiSum += xCalcHADs8x8( &piOrg[x], &piCur[x*iStep], iStrideOrg, iStrideCur, iStep, &piRef[x], iCols, bRound );
    4340       }
    4341       piOrg += iOffsetOrg;
    4342       piCur += iOffsetCur;
    4343       piRef += (iCols << 3);
    4344     }
    4345   }
    4346   else if( ( iRows % 4 == 0) && (iCols % 4 == 0) )
    4347   {
    4348     Int  iOffsetOrg = iStrideOrg<<2;
    4349     Int  iOffsetCur = iStrideCur<<2;
    4350    
    4351     for ( y=0; y<iRows; y+= 4 )
    4352     {
    4353       for ( x=0; x<iCols; x+= 4 ) // do HAD over 4xiCols pixels
    4354       {
    4355         uiSum += xCalcHADs4x4( &piOrg[x], &piCur[x*iStep], iStrideOrg, iStrideCur, iStep, &piRef[x], iCols, bRound );
    4356       }
    4357       piOrg += iOffsetOrg;
    4358       piCur += iOffsetCur;
    4359       piRef += (iCols << 2);
    4360     }
    4361   }
    4362   else
    4363   {
    4364     for ( y=0; y<iRows; y+=2 )
    4365     {
    4366       for ( x=0; x<iCols; x+=2 )// do HAD over 2xiCols pixels
    4367       {
    4368         uiSum += xCalcHADs2x2( &piOrg[x], &piCur[x*iStep], iStrideOrg, iStrideCur, iStep, &piRef[x], iCols, bRound );
    4369       }
    4370       piOrg += iStrideOrg;  // should this be (iStrideOrg << 1) ?
    4371       piCur += iStrideCur;
    4372       piRef += iCols;
    4373     }
    4374   }
    4375  
    4376   return ( uiSum >> g_uiBitIncrement );
    4377 }
    4378 
    4379 #endif
    4380 
    // 2x2 Hadamard cost of the difference between piOrg and piCur.
    // The four sample differences are combined by a vertical then a horizontal add/subtract
    // stage, and the absolute values of the four results are summed and returned.
    //   piOrg / iStrideOrg   original samples and their row stride (columns are adjacent)
    //   piCur / iStrideCur   compared samples and their row stride
    //   iStep                horizontal distance between consecutive piCur samples
    4381 UInt TComRdCost::xCalcHADs2x2( Pel *piOrg, Pel *piCur, Int iStrideOrg, Int iStrideCur, Int iStep )
    4382 {
    4383   Int satd = 0, diff[4], m[4];
           // diff[0..1] = top row, diff[2..3] = bottom row
    4384   diff[0] = piOrg[0             ] - piCur[0*iStep];
    4385   diff[1] = piOrg[1             ] - piCur[1*iStep];
    4386   diff[2] = piOrg[iStrideOrg    ] - piCur[0*iStep + iStrideCur];
    4387   diff[3] = piOrg[iStrideOrg + 1] - piCur[1*iStep + iStrideCur];
    4388  
           // vertical stage: column sums (m[0], m[1]) and column differences (m[2], m[3])
    4389   m[0] = diff[0] + diff[2];
    4390   m[1] = diff[1] + diff[3];
    4391   m[2] = diff[0] - diff[2];
    4392   m[3] = diff[1] - diff[3];
    4393  
           // horizontal stage folded into the absolute-value accumulation
    4394   satd += abs(m[0] + m[1]);
    4395   satd += abs(m[0] - m[1]);
    4396   satd += abs(m[2] + m[3]);
    4397   satd += abs(m[2] - m[3]);
    4398  
    4399   return satd;
    4400 }
    4401 
    4402 UInt TComRdCost::xCalcHADs4x4( Pel *piOrg, Pel *piCur, Int iStrideOrg, Int iStrideCur, Int iStep )
    4403 {
    4404   Int k, satd = 0, diff[16], m[16], d[16];
    4405  
    4406   for( k = 0; k < 16; k+=4 )
    4407   {
    4408     diff[k+0] = piOrg[0] - piCur[0*iStep];
    4409     diff[k+1] = piOrg[1] - piCur[1*iStep];
    4410     diff[k+2] = piOrg[2] - piCur[2*iStep];
    4411     diff[k+3] = piOrg[3] - piCur[3*iStep];
    4412    
     2219#if NS_HAD
     2220UInt TComRdCost::xCalcHADs16x4( Pel *piOrg, Pel *piCur, Int iStrideOrg, Int iStrideCur, Int iStep )
     2221{
     2222  Int k, i, j, jj, sad=0;
     2223  Int diff[64], m1[4][16], m2[4][16];
     2224  assert( iStep == 1 );
     2225  for( k = 0; k < 64; k += 16 )
     2226  {
     2227    diff[k+0] = piOrg[0] - piCur[0];
     2228    diff[k+1] = piOrg[1] - piCur[1];
     2229    diff[k+2] = piOrg[2] - piCur[2];
     2230    diff[k+3] = piOrg[3] - piCur[3];
     2231    diff[k+4] = piOrg[4] - piCur[4];
     2232    diff[k+5] = piOrg[5] - piCur[5];
     2233    diff[k+6] = piOrg[6] - piCur[6];
     2234    diff[k+7] = piOrg[7] - piCur[7];
     2235
     2236    diff[k+8]  = piOrg[8]  - piCur[8] ;
     2237    diff[k+9]  = piOrg[9]  - piCur[9] ;
     2238    diff[k+10] = piOrg[10] - piCur[10];
     2239    diff[k+11] = piOrg[11] - piCur[11];
     2240    diff[k+12] = piOrg[12] - piCur[12];
     2241    diff[k+13] = piOrg[13] - piCur[13];
     2242    diff[k+14] = piOrg[14] - piCur[14];
     2243    diff[k+15] = piOrg[15] - piCur[15];
     2244
    44132245    piCur += iStrideCur;
    44142246    piOrg += iStrideOrg;
    44152247  }
    4416  
    4417   /*===== hadamard transform =====*/
    4418   m[ 0] = diff[ 0] + diff[12];
    4419   m[ 1] = diff[ 1] + diff[13];
    4420   m[ 2] = diff[ 2] + diff[14];
    4421   m[ 3] = diff[ 3] + diff[15];
    4422   m[ 4] = diff[ 4] + diff[ 8];
    4423   m[ 5] = diff[ 5] + diff[ 9];
    4424   m[ 6] = diff[ 6] + diff[10];
    4425   m[ 7] = diff[ 7] + diff[11];
    4426   m[ 8] = diff[ 4] - diff[ 8];
    4427   m[ 9] = diff[ 5] - diff[ 9];
    4428   m[10] = diff[ 6] - diff[10];
    4429   m[11] = diff[ 7] - diff[11];
    4430   m[12] = diff[ 0] - diff[12];
    4431   m[13] = diff[ 1] - diff[13];
    4432   m[14] = diff[ 2] - diff[14];
    4433   m[15] = diff[ 3] - diff[15];
    4434  
    4435   d[ 0] = m[ 0] + m[ 4];
    4436   d[ 1] = m[ 1] + m[ 5];
    4437   d[ 2] = m[ 2] + m[ 6];
    4438   d[ 3] = m[ 3] + m[ 7];
    4439   d[ 4] = m[ 8] + m[12];
    4440   d[ 5] = m[ 9] + m[13];
    4441   d[ 6] = m[10] + m[14];
    4442   d[ 7] = m[11] + m[15];
    4443   d[ 8] = m[ 0] - m[ 4];
    4444   d[ 9] = m[ 1] - m[ 5];
    4445   d[10] = m[ 2] - m[ 6];
    4446   d[11] = m[ 3] - m[ 7];
    4447   d[12] = m[12] - m[ 8];
    4448   d[13] = m[13] - m[ 9];
    4449   d[14] = m[14] - m[10];
    4450   d[15] = m[15] - m[11];
    4451  
    4452   m[ 0] = d[ 0] + d[ 3];
    4453   m[ 1] = d[ 1] + d[ 2];
    4454   m[ 2] = d[ 1] - d[ 2];
    4455   m[ 3] = d[ 0] - d[ 3];
    4456   m[ 4] = d[ 4] + d[ 7];
    4457   m[ 5] = d[ 5] + d[ 6];
    4458   m[ 6] = d[ 5] - d[ 6];
    4459   m[ 7] = d[ 4] - d[ 7];
    4460   m[ 8] = d[ 8] + d[11];
    4461   m[ 9] = d[ 9] + d[10];
    4462   m[10] = d[ 9] - d[10];
    4463   m[11] = d[ 8] - d[11];
    4464   m[12] = d[12] + d[15];
    4465   m[13] = d[13] + d[14];
    4466   m[14] = d[13] - d[14];
    4467   m[15] = d[12] - d[15];
    4468  
    4469   d[ 0] = m[ 0] + m[ 1];
    4470   d[ 1] = m[ 0] - m[ 1];
    4471   d[ 2] = m[ 2] + m[ 3];
    4472   d[ 3] = m[ 3] - m[ 2];
    4473   d[ 4] = m[ 4] + m[ 5];
    4474   d[ 5] = m[ 4] - m[ 5];
    4475   d[ 6] = m[ 6] + m[ 7];
    4476   d[ 7] = m[ 7] - m[ 6];
    4477   d[ 8] = m[ 8] + m[ 9];
    4478   d[ 9] = m[ 8] - m[ 9];
    4479   d[10] = m[10] + m[11];
    4480   d[11] = m[11] - m[10];
    4481   d[12] = m[12] + m[13];
    4482   d[13] = m[12] - m[13];
    4483   d[14] = m[14] + m[15];
    4484   d[15] = m[15] - m[14];
    4485  
    4486   for (k=0; k<16; ++k)
    4487   {
    4488     satd += abs(d[k]);
    4489   }
    4490   satd = ((satd+1)>>1);
    4491  
    4492   return satd;
    4493 }
    4494 
    4495 UInt TComRdCost::xCalcHADs8x8( Pel *piOrg, Pel *piCur, Int iStrideOrg, Int iStrideCur, Int iStep )
     2248
     2249  //horizontal
     2250  for (j=0; j < 4; j++)
     2251  {
     2252    jj = j << 4;
     2253
     2254    m2[j][0]  = diff[jj  ] + diff[jj+8];
     2255    m2[j][1]  = diff[jj+1] + diff[jj+9];
     2256    m2[j][2]  = diff[jj+2] + diff[jj+10];
     2257    m2[j][3]  = diff[jj+3] + diff[jj+11];
     2258    m2[j][4]  = diff[jj+4] + diff[jj+12];
     2259    m2[j][5]  = diff[jj+5] + diff[jj+13];
     2260    m2[j][6]  = diff[jj+6] + diff[jj+14];
     2261    m2[j][7]  = diff[jj+7] + diff[jj+15];
     2262    m2[j][8]  = diff[jj  ] - diff[jj+8];
     2263    m2[j][9]  = diff[jj+1] - diff[jj+9];
     2264    m2[j][10] = diff[jj+2] - diff[jj+10];
     2265    m2[j][11] = diff[jj+3] - diff[jj+11];
     2266    m2[j][12] = diff[jj+4] - diff[jj+12];
     2267    m2[j][13] = diff[jj+5] - diff[jj+13];
     2268    m2[j][14] = diff[jj+6] - diff[jj+14];
     2269    m2[j][15] = diff[jj+7] - diff[jj+15];
     2270
     2271    m1[j][0]  = m2[j][0]  + m2[j][4];
     2272    m1[j][1]  = m2[j][1]  + m2[j][5];
     2273    m1[j][2]  = m2[j][2]  + m2[j][6];
     2274    m1[j][3]  = m2[j][3]  + m2[j][7];
     2275    m1[j][4]  = m2[j][0]  - m2[j][4];
     2276    m1[j][5]  = m2[j][1]  - m2[j][5];
     2277    m1[j][6]  = m2[j][2]  - m2[j][6];
     2278    m1[j][7]  = m2[j][3]  - m2[j][7];
     2279    m1[j][8]  = m2[j][8]  + m2[j][12];
     2280    m1[j][9]  = m2[j][9]  + m2[j][13];
     2281    m1[j][10] = m2[j][10] + m2[j][14];
     2282    m1[j][11] = m2[j][11] + m2[j][15];
     2283    m1[j][12] = m2[j][8]  - m2[j][12];
     2284    m1[j][13] = m2[j][9]  - m2[j][13];
     2285    m1[j][14] = m2[j][10] - m2[j][14];
     2286    m1[j][15] = m2[j][11] - m2[j][15];
     2287
     2288    m2[j][0]  = m1[j][0]  + m1[j][2];
     2289    m2[j][1]  = m1[j][1]  + m1[j][3];
     2290    m2[j][2]  = m1[j][0]  - m1[j][2];
     2291    m2[j][3]  = m1[j][1]  - m1[j][3];
     2292    m2[j][4]  = m1[j][4]  + m1[j][6];
     2293    m2[j][5]  = m1[j][5]  + m1[j][7];
     2294    m2[j][6]  = m1[j][4]  - m1[j][6];
     2295    m2[j][7]  = m1[j][5]  - m1[j][7];
     2296    m2[j][8]  = m1[j][8]  + m1[j][10];
     2297    m2[j][9]  = m1[j][9]  + m1[j][11];
     2298    m2[j][10] = m1[j][8]  - m1[j][10];
     2299    m2[j][11] = m1[j][9]  - m1[j][11];
     2300    m2[j][12] = m1[j][12] + m1[j][14];
     2301    m2[j][13] = m1[j][13] + m1[j][15];
     2302    m2[j][14] = m1[j][12] - m1[j][14];
     2303    m2[j][15] = m1[j][13] - m1[j][15];
     2304
     2305    m1[j][0]  = m2[j][0]  + m2[j][1];
     2306    m1[j][1]  = m2[j][0]  - m2[j][1];
     2307    m1[j][2]  = m2[j][2]  + m2[j][3];
     2308    m1[j][3]  = m2[j][2]  - m2[j][3];
     2309    m1[j][4]  = m2[j][4]  + m2[j][5];
     2310    m1[j][5]  = m2[j][4]  - m2[j][5];
     2311    m1[j][6]  = m2[j][6]  + m2[j][7];
     2312    m1[j][7]  = m2[j][6]  - m2[j][7];
     2313    m1[j][8]  = m2[j][8]  + m2[j][9];
     2314    m1[j][9]  = m2[j][8]  - m2[j][9];
     2315    m1[j][10] = m2[j][10] + m2[j][11];
     2316    m1[j][11] = m2[j][10] - m2[j][11];
     2317    m1[j][12] = m2[j][12] + m2[j][13];
     2318    m1[j][13] = m2[j][12] - m2[j][13];
     2319    m1[j][14] = m2[j][14] + m2[j][15];
     2320    m1[j][15] = m2[j][14] - m2[j][15];
     2321  }
     2322
     2323  //vertical
     2324  for (i=0; i < 16; i++)
     2325  {   
     2326    m2[0][i] = m1[0][i] + m1[2][i];
     2327    m2[1][i] = m1[1][i] + m1[3][i];
     2328    m2[2][i] = m1[0][i] - m1[2][i];
     2329    m2[3][i] = m1[1][i] - m1[3][i];
     2330
     2331    m1[0][i] = m2[0][i] + m2[1][i];
     2332    m1[1][i] = m2[0][i] - m2[1][i];
     2333    m1[2][i] = m2[2][i] + m2[3][i];
     2334    m1[3][i] = m2[2][i] - m2[3][i];
     2335  }
     2336
     2337  for (i = 0; i < 4; i++)
     2338  {
     2339    for (j = 0; j < 16; j++)
     2340    {
     2341      sad += abs(m1[i][j]);
     2342    }
     2343  }
     2344
     2345  sad=((sad+2)>>2);
     2346
     2347  return sad;
     2348}
     2349
     2350UInt TComRdCost::xCalcHADs4x16( Pel *piOrg, Pel *piCur, Int iStrideOrg, Int iStrideCur, Int iStep )
    44962351{
    44972352  Int k, i, j, jj, sad=0;
    4498   Int diff[64], m1[8][8], m2[8][8], m3[8][8];
    4499   Int iStep2 = iStep<<1;
    4500   Int iStep3 = iStep2 + iStep;
    4501   Int iStep4 = iStep3 + iStep;
    4502   Int iStep5 = iStep4 + iStep;
    4503   Int iStep6 = iStep5 + iStep;
    4504   Int iStep7 = iStep6 + iStep;
    4505  
    4506   for( k = 0; k < 64; k+=8 )
    4507   {
    4508     diff[k+0] = piOrg[0] - piCur[     0];
    4509     diff[k+1] = piOrg[1] - piCur[iStep ];
    4510     diff[k+2] = piOrg[2] - piCur[iStep2];
    4511     diff[k+3] = piOrg[3] - piCur[iStep3];
    4512     diff[k+4] = piOrg[4] - piCur[iStep4];
    4513     diff[k+5] = piOrg[5] - piCur[iStep5];
    4514     diff[k+6] = piOrg[6] - piCur[iStep6];
    4515     diff[k+7] = piOrg[7] - piCur[iStep7];
    4516    
     2353  Int diff[64], m1[16][4], m2[16][4], m3[16][4];
     2354  assert( iStep == 1 );
     2355  for( k = 0; k < 64; k += 4 )
     2356  {
     2357    diff[k+0] = piOrg[0] - piCur[0];
     2358    diff[k+1] = piOrg[1] - piCur[1];
     2359    diff[k+2] = piOrg[2] - piCur[2];
     2360    diff[k+3] = piOrg[3] - piCur[3];
     2361
    45172362    piCur += iStrideCur;
    45182363    piOrg += iStrideOrg;
    45192364  }
    4520  
     2365
    45212366  //horizontal
    4522   for (j=0; j < 8; j++)
    4523   {
    4524     jj = j << 3;
    4525     m2[j][0] = diff[jj  ] + diff[jj+4];
    4526     m2[j][1] = diff[jj+1] + diff[jj+5];
    4527     m2[j][2] = diff[jj+2] + diff[jj+6];
    4528     m2[j][3] = diff[jj+3] + diff[jj+7];
    4529     m2[j][4] = diff[jj  ] - diff[jj+4];
    4530     m2[j][5] = diff[jj+1] - diff[jj+5];
    4531     m2[j][6] = diff[jj+2] - diff[jj+6];
    4532     m2[j][7] = diff[jj+3] - diff[jj+7];
    4533    
    4534     m1[j][0] = m2[j][0] + m2[j][2];
    4535     m1[j][1] = m2[j][1] + m2[j][3];
    4536     m1[j][2] = m2[j][0] - m2[j][2];
    4537     m1[j][3] = m2[j][1] - m2[j][3];
    4538     m1[j][4] = m2[j][4] + m2[j][6];
    4539     m1[j][5] = m2[j][5] + m2[j][7];
    4540     m1[j][6] = m2[j][4] - m2[j][6];
    4541     m1[j][7] = m2[j][5] - m2[j][7];
    4542    
    4543     m2[j][0] = m1[j][0] + m1[j][1];
    4544     m2[j][1] = m1[j][0] - m1[j][1];
    4545     m2[j][2] = m1[j][2] + m1[j][3];
    4546     m2[j][3] = m1[j][2] - m1[j][3];
    4547     m2[j][4] = m1[j][4] + m1[j][5];
    4548     m2[j][5] = m1[j][4] - m1[j][5];
    4549     m2[j][6] = m1[j][6] + m1[j][7];
    4550     m2[j][7] = m1[j][6] - m1[j][7];
    4551   }
    4552  
     2367  for (j=0; j < 16; j++)
     2368  {
     2369    jj = j << 2;
     2370    m2[j][0] = diff[jj  ] + diff[jj+2];
     2371    m2[j][1] = diff[jj+1] + diff[jj+3];
     2372    m2[j][2] = diff[jj  ] - diff[jj+2];
     2373    m2[j][3] = diff[jj+1] - diff[jj+3];
     2374
     2375    m1[j][0] = m2[j][0] + m2[j][1];
     2376    m1[j][1] = m2[j][0] - m2[j][1];
     2377    m1[j][2] = m2[j][2] + m2[j][3];
     2378    m1[j][3] = m2[j][2] - m2[j][3];
     2379  }
     2380
    45532381  //vertical
    4554   for (i=0; i < 8; i++)
    4555   {
    4556     m3[0][i] = m2[0][i] + m2[4][i];
    4557     m3[1][i] = m2[1][i] + m2[5][i];
    4558     m3[2][i] = m2[2][i] + m2[6][i];
    4559     m3[3][i] = m2[3][i] + m2[7][i];
    4560     m3[4][i] = m2[0][i] - m2[4][i];
    4561     m3[5][i] = m2[1][i] - m2[5][i];
    4562     m3[6][i] = m2[2][i] - m2[6][i];
    4563     m3[7][i] = m2[3][i] - m2[7][i];
    4564    
    4565     m1[0][i] = m3[0][i] + m3[2][i];
    4566     m1[1][i] = m3[1][i] + m3[3][i];
    4567     m1[2][i] = m3[0][i] - m3[2][i];
    4568     m1[3][i] = m3[1][i] - m3[3][i];
    4569     m1[4][i] = m3[4][i] + m3[6][i];
    4570     m1[5][i] = m3[5][i] + m3[7][i];
    4571     m1[6][i] = m3[4][i] - m3[6][i];
    4572     m1[7][i] = m3[5][i] - m3[7][i];
    4573    
    4574     m2[0][i] = m1[0][i] + m1[1][i];
    4575     m2[1][i] = m1[0][i] - m1[1][i];
    4576     m2[2][i] = m1[2][i] + m1[3][i];
    4577     m2[3][i] = m1[2][i] - m1[3][i];
    4578     m2[4][i] = m1[4][i] + m1[5][i];
    4579     m2[5][i] = m1[4][i] - m1[5][i];
    4580     m2[6][i] = m1[6][i] + m1[7][i];
    4581     m2[7][i] = m1[6][i] - m1[7][i];
    4582   }
    4583  
    4584   for (j=0; j < 8; j++)
    4585   {
    4586     for (i=0; i < 8; i++)
    4587       sad += (abs(m2[j][i]));
    4588   }
    4589  
     2382  for (i=0; i < 4; i++)
     2383  {
     2384    m2[0][i]  = m1[0][i] + m1[8][i];
     2385    m2[1][i]  = m1[1][i] + m1[9][i];
     2386    m2[2][i]  = m1[2][i] + m1[10][i];
     2387    m2[3][i]  = m1[3][i] + m1[11][i];
     2388    m2[4][i]  = m1[4][i] + m1[12][i];
     2389    m2[5][i]  = m1[5][i] + m1[13][i];
     2390    m2[6][i]  = m1[6][i] + m1[14][i];
     2391    m2[7][i]  = m1[7][i] + m1[15][i];
     2392    m2[8][i]  = m1[0][i] - m1[8][i];
     2393    m2[9][i]  = m1[1][i] - m1[9][i];
     2394    m2[10][i] = m1[2][i] - m1[10][i];
     2395    m2[11][i] = m1[3][i] - m1[11][i];
     2396    m2[12][i] = m1[4][i] - m1[12][i];
     2397    m2[13][i] = m1[5][i] - m1[13][i];
     2398    m2[14][i] = m1[6][i] - m1[14][i];
     2399    m2[15][i] = m1[7][i] - m1[15][i];
     2400
     2401    m3[0][i]  = m2[0][i]  + m2[4][i];
     2402    m3[1][i]  = m2[1][i]  + m2[5][i];
     2403    m3[2][i]  = m2[2][i]  + m2[6][i];
     2404    m3[3][i]  = m2[3][i]  + m2[7][i];
     2405    m3[4][i]  = m2[0][i]  - m2[4][i];
     2406    m3[5][i]  = m2[1][i]  - m2[5][i];
     2407    m3[6][i]  = m2[2][i]  - m2[6][i];
     2408    m3[7][i]  = m2[3][i]  - m2[7][i];
     2409    m3[8][i]  = m2[8][i]  + m2[12][i];
     2410    m3[9][i]  = m2[9][i]  + m2[13][i];
     2411    m3[10][i] = m2[10][i] + m2[14][i];
     2412    m3[11][i] = m2[11][i] + m2[15][i];
     2413    m3[12][i] = m2[8][i]  - m2[12][i];
     2414    m3[13][i] = m2[9][i]  - m2[13][i];
     2415    m3[14][i] = m2[10][i] - m2[14][i];
     2416    m3[15][i] = m2[11][i] - m2[15][i];
     2417
     2418    m1[0][i]  = m3[0][i]  + m3[2][i];
     2419    m1[1][i]  = m3[1][i]  + m3[3][i];
     2420    m1[2][i]  = m3[0][i]  - m3[2][i];
     2421    m1[3][i]  = m3[1][i]  - m3[3][i];
     2422    m1[4][i]  = m3[4][i]  + m3[6][i];
     2423    m1[5][i]  = m3[5][i]  + m3[7][i];
     2424    m1[6][i]  = m3[4][i]  - m3[6][i];
     2425    m1[7][i]  = m3[5][i]  - m3[7][i];
     2426    m1[8][i]  = m3[8][i]  + m3[10][i];
     2427    m1[9][i]  = m3[9][i]  + m3[11][i];
     2428    m1[10][i] = m3[8][i]  - m3[10][i];
     2429    m1[11][i] = m3[9][i]  - m3[11][i];
     2430    m1[12][i] = m3[12][i] + m3[14][i];
     2431    m1[13][i] = m3[13][i] + m3[15][i];
     2432    m1[14][i] = m3[12][i] - m3[14][i];
     2433    m1[15][i] = m3[13][i] - m3[15][i];
     2434
     2435    m2[0][i]  = m1[0][i]  + m1[1][i];
     2436    m2[1][i]  = m1[0][i]  - m1[1][i];
     2437    m2[2][i]  = m1[2][i]  + m1[3][i];
     2438    m2[3][i]  = m1[2][i]  - m1[3][i];
     2439    m2[4][i]  = m1[4][i]  + m1[5][i];
     2440    m2[5][i]  = m1[4][i]  - m1[5][i];
     2441    m2[6][i]  = m1[6][i]  + m1[7][i];
     2442    m2[7][i]  = m1[6][i]  - m1[7][i];
     2443    m2[8][i]  = m1[8][i]  + m1[9][i];
     2444    m2[9][i]  = m1[8][i]  - m1[9][i];
     2445    m2[10][i] = m1[10][i] + m1[11][i];
     2446    m2[11][i] = m1[10][i] - m1[11][i];
     2447    m2[12][i] = m1[12][i] + m1[13][i];
     2448    m2[13][i] = m1[12][i] - m1[13][i];
     2449    m2[14][i] = m1[14][i] + m1[15][i];
     2450    m2[15][i] = m1[14][i] - m1[15][i];
     2451  }
     2452
     2453  for (i = 0; i < 16; i++)
     2454  {
     2455    for (j = 0; j < 4; j++)
     2456    {
     2457      sad += abs(m2[i][j]);
     2458    }
     2459  }
     2460
    45902461  sad=((sad+2)>>2);
    4591  
     2462
    45922463  return sad;
    45932464}
     2465#endif
    45942466
    45952467UInt TComRdCost::xGetHADs4( DistParam* pcDtParam )
    45962468{
    4597 #ifdef WEIGHT_PRED
    4598   if ( pcDtParam->applyWeight )
     2469  if ( pcDtParam->bApplyWeight )
    45992470  {
    46002471    return xGetHADs4w( pcDtParam );
    46012472  }
    4602 #endif
    46032473  Pel* piOrg   = pcDtParam->pOrg;
    46042474  Pel* piCur   = pcDtParam->pCur;
     
    46252495UInt TComRdCost::xGetHADs8( DistParam* pcDtParam )
    46262496{
    4627 #ifdef WEIGHT_PRED
    4628   if ( pcDtParam->applyWeight )
     2497  if ( pcDtParam->bApplyWeight )
    46292498  {
    46302499    return xGetHADs8w( pcDtParam );
    46312500  }
    4632 #endif
    46332501  Pel* piOrg   = pcDtParam->pOrg;
    46342502  Pel* piCur   = pcDtParam->pCur;
     
    46632531UInt TComRdCost::xGetHADs( DistParam* pcDtParam )
    46642532{
    4665 #ifdef WEIGHT_PRED
    4666   if ( pcDtParam->applyWeight )
     2533  if ( pcDtParam->bApplyWeight )
    46672534  {
    46682535    return xGetHADsw( pcDtParam );
    46692536  }
    4670 #endif
    46712537  Pel* piOrg   = pcDtParam->pOrg;
    46722538  Pel* piCur   = pcDtParam->pCur;
     
    46812547  UInt uiSum = 0;
    46822548 
     2549#if NS_HAD
     2550  if( ( ( iRows % 8 == 0) && (iCols % 8 == 0) && ( iRows == iCols ) ) || ( ( iRows % 8 == 0 ) && (iCols % 8 == 0) && !pcDtParam->bUseNSHAD ) )
     2551#else
    46832552  if( ( iRows % 8 == 0) && (iCols % 8 == 0) )
     2553#endif
    46842554  {
    46852555    Int  iOffsetOrg = iStrideOrg<<3;
     
    46952565    }
    46962566  }
     2567#if NS_HAD
     2568  else if ( ( iCols > 8 ) && ( iCols > iRows ) && pcDtParam->bUseNSHAD )
     2569  {
     2570    Int  iOffsetOrg = iStrideOrg<<2;
     2571    Int  iOffsetCur = iStrideCur<<2;
     2572    for ( y=0; y<iRows; y+= 4 )
     2573    {
     2574      for ( x=0; x<iCols; x+= 16 )
     2575      {
     2576        uiSum += xCalcHADs16x4( &piOrg[x], &piCur[x*iStep], iStrideOrg, iStrideCur, iStep );
     2577      }
     2578      piOrg += iOffsetOrg;
     2579      piCur += iOffsetCur;
     2580    }
     2581  }
     2582  else if ( ( iRows > 8 ) && ( iCols < iRows ) && pcDtParam->bUseNSHAD )
     2583  {
     2584    Int  iOffsetOrg = iStrideOrg<<4;
     2585    Int  iOffsetCur = iStrideCur<<4;
     2586    for ( y=0; y<iRows; y+= 16 )
     2587    {
     2588      for ( x=0; x<iCols; x+= 4 )
     2589      {
     2590        uiSum += xCalcHADs4x16( &piOrg[x], &piCur[x*iStep], iStrideOrg, iStrideCur, iStep );
     2591      }
     2592      piOrg += iOffsetOrg;
     2593      piCur += iOffsetCur;
     2594    }
     2595  }
     2596#endif
    46972597  else if( ( iRows % 4 == 0) && (iCols % 4 == 0) )
    46982598  {
     
    47102610    }
    47112611  }
    4712 #ifdef DCM_RDCOST_TEMP_FIX //Temporary fix since row size can be 1 or 3 for chroma (such a case does not occur under current encoder settings)
    47132612  else if( ( iRows % 2 == 0) && (iCols % 2 == 0) )
    47142613  {
    47152614    Int  iOffsetOrg = iStrideOrg<<1;
    47162615    Int  iOffsetCur = iStrideCur<<1;
    4717 #else
    4718   else
    4719   {
    4720 #endif
    47212616    for ( y=0; y<iRows; y+=2 )
    47222617    {
     
    47252620        uiSum += xCalcHADs2x2( &piOrg[x], &piCur[x*iStep], iStrideOrg, iStrideCur, iStep );
    47262621      }
    4727 #ifdef DCM_RDCOST_TEMP_FIX //Temporary fix since we need to increment by 2*iStride instead of iStride
    47282622      piOrg += iOffsetOrg;
    47292623      piCur += iOffsetCur;
    4730 #else
    4731       piOrg += iStrideOrg;
    4732       piCur += iStrideCur;
    4733 #endif
    4734     }
    4735   }
    4736 #ifdef DCM_RDCOST_TEMP_FIX //Temporary fix to return MAX_UINT until this case is properly handled
     2624    }
     2625  }
    47372626  else
    47382627  {
    4739     printf("xGetHADs not supported for this dimension. Skipping computation of HAD and returning MAX_UINT\n");
    4740     return (MAX_UINT);
    4741   }
    4742 #endif
     2628    assert(false);
     2629  }
    47432630 
    47442631  return ( uiSum >> g_uiBitIncrement );
    47452632}
    4746 
    47472633
    47482634#if HHI_VSO
     
    47832669};
    47842670
    4785 Void  TComRdCost::setRefDataFromMVDInfo( TComMVDRefData* pRefInfo )
    4786 {
    4787   if ( m_apRefPics != NULL )
    4788   {
    4789     delete[] m_apRefPics;
    4790     m_apRefPics = NULL;
    4791   };
    4792 
    4793   if ( m_paaiShiftLUTs != NULL )
    4794   { // Delete only first dimension, other dimension are not create in this class
    4795     delete[] m_paaiShiftLUTs;
    4796     m_paaiShiftLUTs = NULL;
    4797   };
    4798 
    4799 
    4800   m_uiNumberRefPics = ( m_uiVSOMode == 1 ) ? 3 : pRefInfo->getNumOfRefViews();
    4801   m_apRefPics     = new TComPicYuv*[ m_uiNumberRefPics ];
    4802   m_paaiShiftLUTs = new Int**[ m_uiNumberRefPics ];
    4803 
    4804   if ( m_uiVSOMode == 1 )
    4805   {
    4806     pRefInfo->getRefPicYuvAndLUTMode1(m_apRefPics, m_paaiShiftLUTs);
    4807   }
    4808   else
    4809   {
    4810     pRefInfo->getRefPicYuvAndLUT(m_apRefPics, m_paaiShiftLUTs);
    4811   }
    4812   m_pcVideoPicYuv = pRefInfo->getPicYuvVideo();
    4813 }
    48142671
    48152672Void TComRdCost::setVSOMode( UInt uiIn )
     
    48302687Double TComRdCost::calcRdCostVSO( UInt uiBits, Dist uiDistortion, Bool bFlag, DFunc eDFunc )
    48312688{
    4832   assert( m_bUseLambdaScaleVSO );
     2689  assert( m_bUseLambdaScaleVSO );  
    48332690
    48342691  Double dRdCost = 0.0;
    4835   Double dLambda = 0.0;
     2692  Double dLambda = 0.0;   
    48362693
    48372694  switch ( eDFunc )
     
    48952752
    48962753#endif
     2754//! \}
Note: See TracChangeset for help on using the changeset viewer.