Changeset 1313 in 3DVCSoftware for trunk/source/Lib/TLibCommon/TComTrQuant.cpp


Ignore:
Timestamp:
13 Aug 2015, 17:38:13 (9 years ago)
Author:
tech
Message:

Merged 14.1-update-dev1@1312.

File:
1 edited

Legend:

Unmodified
Added
Removed
  • trunk/source/Lib/TLibCommon/TComTrQuant.cpp

    r1179 r1313  
    22 * License, included below. This software may be subject to other third party
    33 * and contributor rights, including patent rights, and no such rights are
    4  * granted under this license. 
     4 * granted under this license.
    55 *
    6 * Copyright (c) 2010-2015, ITU/ISO/IEC
     6 * Copyright (c) 2010-2015, ITU/ISO/IEC
    77 * All rights reserved.
    88 *
     
    3838#include <stdlib.h>
    3939#include <math.h>
     40#include <limits>
    4041#include <memory.h>
    4142#include "TComTrQuant.h"
    4243#include "TComPic.h"
    4344#include "ContextTables.h"
     45#include "TComTU.h"
     46#include "Debug.h"
    4447
    4548typedef struct
     
    6164#define RDOQ_CHROMA                 1           ///< use of RDOQ in chroma
    6265
     66
    6367// ====================================================================================================================
    64 // Tables
     68// QpParam constructor
    6569// ====================================================================================================================
    6670
    67 // RDOQ parameter
    68 
    69 // ====================================================================================================================
    70 // Qp class member functions
    71 // ====================================================================================================================
    72 
    73 QpParam::QpParam()
    74 {
    75 }
     71QpParam::QpParam(const Int           qpy,
     72                 const ChannelType   chType,
     73                 const Int           qpBdOffset,
     74                 const Int           chromaQPOffset,
     75                 const ChromaFormat  chFmt )
     76{
     77  Int baseQp;
     78
     79  if(isLuma(chType))
     80  {
     81    baseQp = qpy + qpBdOffset;
     82  }
     83  else
     84  {
     85    baseQp = Clip3( -qpBdOffset, (chromaQPMappingTableSize - 1), qpy + chromaQPOffset );
     86
     87    if(baseQp < 0)
     88    {
     89      baseQp = baseQp + qpBdOffset;
     90    }
     91    else
     92    {
     93      baseQp = getScaledChromaQP(baseQp, chFmt) + qpBdOffset;
     94    }
     95  }
     96
     97  Qp =baseQp;
     98  per=baseQp/6;
     99  rem=baseQp%6;
     100}
     101
     102QpParam::QpParam(const TComDataCU &cu, const ComponentID compID)
     103{
     104  Int chromaQpOffset = 0;
     105
     106  if (isChroma(compID))
     107  {
     108    chromaQpOffset += cu.getSlice()->getPPS()->getQpOffset(compID);
     109    chromaQpOffset += cu.getSlice()->getSliceChromaQpDelta(compID);
     110
     111    chromaQpOffset += cu.getSlice()->getPPS()->getPpsRangeExtension().getChromaQpOffsetListEntry(cu.getChromaQpAdj(0)).u.offset[Int(compID)-1];
     112  }
     113
     114  *this = QpParam(cu.getQP( 0 ),
     115                  toChannelType(compID),
     116                  cu.getSlice()->getSPS()->getQpBDOffset(toChannelType(compID)),
     117                  chromaQpOffset,
     118                  cu.getPic()->getChromaFormat());
     119}
     120
    76121
    77122// ====================================================================================================================
     
    81126TComTrQuant::TComTrQuant()
    82127{
    83   m_cQP.clear();
    84  
    85128  // allocate temporary buffers
    86   m_plTempCoeff  = new Int[ MAX_CU_SIZE*MAX_CU_SIZE ];
    87  
     129  m_plTempCoeff  = new TCoeff[ MAX_CU_SIZE*MAX_CU_SIZE ];
     130
    88131  // allocate bit estimation class  (for RDOQ)
    89132  m_pcEstBitsSbac = new estBitsSbacStruct;
     
    99142    m_plTempCoeff = NULL;
    100143  }
    101  
     144
    102145  // delete bit estimation class
    103146  if ( m_pcEstBitsSbac )
     
    111154Void TComTrQuant::storeSliceQpNext(TComSlice* pcSlice)
    112155{
     156  // NOTE: does this work with negative QPs or when some blocks are transquant-bypass enabled?
     157
    113158  Int qpBase = pcSlice->getSliceQpBase();
    114159  Int sliceQpused = pcSlice->getSliceQp();
    115160  Int sliceQpnext;
    116161  Double alpha = qpBase < 17 ? 0.5 : 1;
    117  
     162
    118163  Int cnt=0;
    119164  for(Int u=1; u<=LEVEL_RANGE; u++)
    120   { 
     165  {
    121166    cnt += m_sliceNsamples[u] ;
    122167  }
     
    161206  }
    162207
    163   m_qpDelta[qpBase] = sliceQpnext - qpBase; 
     208  m_qpDelta[qpBase] = sliceQpnext - qpBase;
    164209}
    165210
     
    173218
    174219Void TComTrQuant::clearSliceARLCnt()
    175 { 
     220{
    176221  memset(m_sliceSumC, 0, sizeof(Double)*(LEVEL_RANGE+1));
    177222  memset(m_sliceNsamples, 0, sizeof(Int)*(LEVEL_RANGE+1));
     
    180225
    181226
    182 /** Set qP for Quantization.
    183  * \param qpy QPy
    184  * \param bLowpass
    185  * \param eSliceType
    186  * \param eTxtType
    187  * \param qpBdOffset
    188  * \param chromaQPOffset
    189  *
    190  * return void 
    191  */
    192 Void TComTrQuant::setQPforQuant( Int qpy, TextType eTxtType, Int qpBdOffset, Int chromaQPOffset)
    193 {
    194   Int qpScaled;
    195 
    196   if(eTxtType == TEXT_LUMA)
    197   {
    198     qpScaled = qpy + qpBdOffset;
    199   }
    200   else
    201   {
    202     qpScaled = Clip3( -qpBdOffset, 57, qpy + chromaQPOffset );
    203 
    204     if(qpScaled < 0)
    205     {
    206       qpScaled = qpScaled + qpBdOffset;
    207     }
    208     else
    209     {
    210       qpScaled = g_aucChromaScale[ qpScaled ] + qpBdOffset;
    211     }
    212   }
    213   m_cQP.setQpParam( qpScaled );
    214 }
    215227
    216228#if MATRIX_MULT
     
    222234 *  \param uiMode is Intra Prediction mode used in Mode-Dependent DCT/DST only
    223235 */
    224 void xTr(Int bitDepth, Pel *block, Int *coeff, UInt uiStride, UInt uiTrSize, UInt uiMode)
    225 {
    226   Int i,j,k,iSum;
    227   Int tmp[32*32];
    228   const Short *iT;
     236Void xTr(Int bitDepth, Pel *block, TCoeff *coeff, UInt uiStride, UInt uiTrSize, Bool useDST, const Int maxLog2TrDynamicRange)
     237{
     238  UInt i,j,k;
     239  TCoeff iSum;
     240  TCoeff tmp[MAX_TU_SIZE * MAX_TU_SIZE];
     241  const TMatrixCoeff *iT;
    229242  UInt uiLog2TrSize = g_aucConvertToBit[ uiTrSize ] + 2;
    230243
    231244  if (uiTrSize==4)
    232245  {
    233     iT  = g_aiT4[0];
     246    iT  = (useDST ? g_as_DST_MAT_4[TRANSFORM_FORWARD][0] : g_aiT4[TRANSFORM_FORWARD][0]);
    234247  }
    235248  else if (uiTrSize==8)
    236249  {
    237     iT = g_aiT8[0];
     250    iT = g_aiT8[TRANSFORM_FORWARD][0];
    238251  }
    239252  else if (uiTrSize==16)
    240253  {
    241     iT = g_aiT16[0];
     254    iT = g_aiT16[TRANSFORM_FORWARD][0];
    242255  }
    243256  else if (uiTrSize==32)
    244257  {
    245     iT = g_aiT32[0];
     258    iT = g_aiT32[TRANSFORM_FORWARD][0];
    246259  }
    247260  else
     
    250263  }
    251264
    252   Int shift_1st = uiLog2TrSize - 1 + bitDepth-8; // log2(N) - 1 + g_bitDepth-8
    253   Int add_1st = 1<<(shift_1st-1);
    254   Int shift_2nd = uiLog2TrSize + 6;
    255   Int add_2nd = 1<<(shift_2nd-1);
     265  const Int TRANSFORM_MATRIX_SHIFT = g_transformMatrixShift[TRANSFORM_FORWARD];
     266
     267  const Int shift_1st = (uiLog2TrSize +  bitDepth + TRANSFORM_MATRIX_SHIFT) - maxLog2TrDynamicRange;
     268  const Int shift_2nd = uiLog2TrSize + TRANSFORM_MATRIX_SHIFT;
     269  const Int add_1st = (shift_1st>0) ? (1<<(shift_1st-1)) : 0;
     270  const Int add_2nd = 1<<(shift_2nd-1);
    256271
    257272  /* Horizontal transform */
    258273
    259   if (uiTrSize==4)
    260   {
    261     if (uiMode != REG_DCT && g_aucDCTDSTMode_Hor[uiMode])
    262     {
    263       iT  =  g_as_DST_MAT_4[0];
    264     }
    265   }
    266274  for (i=0; i<uiTrSize; i++)
    267275  {
     
    276284    }
    277285  }
    278  
     286
    279287  /* Vertical transform */
    280   if (uiTrSize==4)
    281   {
    282     if (uiMode != REG_DCT && g_aucDCTDSTMode_Vert[uiMode])
    283     {
    284       iT  =  g_as_DST_MAT_4[0];
    285     }
    286     else
    287     {
    288       iT  = g_aiT4[0];
    289     }
    290   }
    291288  for (i=0; i<uiTrSize; i++)
    292   {                 
     289  {
    293290    for (j=0; j<uiTrSize; j++)
    294291    {
     
    296293      for (k=0; k<uiTrSize; k++)
    297294      {
    298         iSum += iT[i*uiTrSize+k]*tmp[j*uiTrSize+k];       
    299       }
    300       coeff[i*uiTrSize+j] = (iSum + add_2nd)>>shift_2nd; 
     295        iSum += iT[i*uiTrSize+k]*tmp[j*uiTrSize+k];
     296      }
     297      coeff[i*uiTrSize+j] = (iSum + add_2nd)>>shift_2nd;
    301298    }
    302299  }
     
    310307 *  \param uiMode is Intra Prediction mode used in Mode-Dependent DCT/DST only
    311308 */
    312 void xITr(Int *coeff, Pel *block, UInt uiStride, UInt uiTrSize, UInt uiMode)
    313 {
    314   Int i,j,k,iSum;
    315   Int tmp[32*32];
    316   const Short *iT;
    317  
     309Void xITr(Int bitDepth, TCoeff *coeff, Pel *block, UInt uiStride, UInt uiTrSize, Bool useDST, const Int maxLog2TrDynamicRange)
     310{
     311  UInt i,j,k;
     312  TCoeff iSum;
     313  TCoeff tmp[MAX_TU_SIZE * MAX_TU_SIZE];
     314  const TMatrixCoeff *iT;
     315
    318316  if (uiTrSize==4)
    319317  {
    320     iT  = g_aiT4[0];
     318    iT  = (useDST ? g_as_DST_MAT_4[TRANSFORM_INVERSE][0] : g_aiT4[TRANSFORM_INVERSE][0]);
    321319  }
    322320  else if (uiTrSize==8)
    323321  {
    324     iT = g_aiT8[0];
     322    iT = g_aiT8[TRANSFORM_INVERSE][0];
    325323  }
    326324  else if (uiTrSize==16)
    327325  {
    328     iT = g_aiT16[0];
     326    iT = g_aiT16[TRANSFORM_INVERSE][0];
    329327  }
    330328  else if (uiTrSize==32)
    331329  {
    332     iT = g_aiT32[0];
     330    iT = g_aiT32[TRANSFORM_INVERSE][0];
    333331  }
    334332  else
     
    336334    assert(0);
    337335  }
    338  
    339   Int shift_1st = SHIFT_INV_1ST;
    340   Int add_1st = 1<<(shift_1st-1);
    341   Int shift_2nd = SHIFT_INV_2ND - g_bitDepth-8;
    342   Int add_2nd = 1<<(shift_2nd-1);
    343   if (uiTrSize==4)
    344   {
    345     if (uiMode != REG_DCT && g_aucDCTDSTMode_Vert[uiMode] ) // Check for DCT or DST
    346     {
    347       iT  =  g_as_DST_MAT_4[0];
    348     }
    349   }
    350  
     336
     337  const Int TRANSFORM_MATRIX_SHIFT = g_transformMatrixShift[TRANSFORM_INVERSE];
     338
     339  const Int shift_1st = TRANSFORM_MATRIX_SHIFT + 1; //1 has been added to shift_1st at the expense of shift_2nd
     340  const Int shift_2nd = (TRANSFORM_MATRIX_SHIFT + maxLog2TrDynamicRange - 1) - bitDepth;
     341  const TCoeff clipMinimum = -(1 << maxLog2TrDynamicRange);
     342  const TCoeff clipMaximum =  (1 << maxLog2TrDynamicRange) - 1;
     343  assert(shift_2nd>=0);
     344  const Int add_1st = 1<<(shift_1st-1);
     345  const Int add_2nd = (shift_2nd>0) ? (1<<(shift_2nd-1)) : 0;
     346
    351347  /* Horizontal transform */
    352348  for (i=0; i<uiTrSize; i++)
    353   {   
     349  {
    354350    for (j=0; j<uiTrSize; j++)
    355351    {
    356352      iSum = 0;
    357353      for (k=0; k<uiTrSize; k++)
    358       {       
    359         iSum += iT[k*uiTrSize+i]*coeff[k*uiTrSize+j];
    360       }
    361       tmp[i*uiTrSize+j] = Clip3(-32768, 32767, (iSum + add_1st)>>shift_1st); // Clipping is normative
    362     }
    363   }   
    364  
    365   if (uiTrSize==4)
    366   {
    367     if (uiMode != REG_DCT && g_aucDCTDSTMode_Hor[uiMode] )   // Check for DCT or DST
    368     {
    369       iT  =  g_as_DST_MAT_4[0];
    370     }
    371     else 
    372     {
    373       iT  = g_aiT4[0];
    374     }
    375   }
    376  
     354      {
     355        iSum += iT[k*uiTrSize+i]*coeff[k*uiTrSize+j];
     356      }
     357
     358      // Clipping here is not in the standard, but is used to protect the "Pel" data type into which the inverse-transformed samples will be copied
     359      tmp[i*uiTrSize+j] = Clip3<TCoeff>(clipMinimum, clipMaximum, (iSum + add_1st)>>shift_1st);
     360    }
     361  }
     362
    377363  /* Vertical transform */
    378364  for (i=0; i<uiTrSize; i++)
    379   {   
     365  {
    380366    for (j=0; j<uiTrSize; j++)
    381367    {
    382368      iSum = 0;
    383369      for (k=0; k<uiTrSize; k++)
    384       {       
     370      {
    385371        iSum += iT[k*uiTrSize+j]*tmp[i*uiTrSize+k];
    386372      }
    387       block[i*uiStride+j] = Clip3(-32768, 32767, (iSum + add_2nd)>>shift_2nd); // Clipping is non-normative
    388     }
    389   }
    390 }
    391 
    392 #else //MATRIX_MULT
     373
     374      block[i*uiStride+j] = Clip3<TCoeff>(std::numeric_limits<Pel>::min(), std::numeric_limits<Pel>::max(), (iSum + add_2nd)>>shift_2nd);
     375    }
     376  }
     377}
     378
     379#endif //MATRIX_MULT
     380
    393381
    394382/** 4x4 forward transform implemented using partial butterfly structure (1D)
     
    396384 *  \param dst   output data (transform coefficients)
    397385 *  \param shift specifies right shift after 1D transform
     386 *  \param line  number of 1D transforms to perform (iteration count of the outer loop)
    398387 */
    399 
    400 void partialButterfly4(Short *src,Short *dst,Int shift, Int line)
     388Void partialButterfly4(TCoeff *src, TCoeff *dst, Int shift, Int line)
    401389{
    402390  Int j;
    403   Int E[2],O[2];
    404   Int add = 1<<(shift-1);
     391  TCoeff E[2],O[2];
     392  TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
    405393
    406394  for (j=0; j<line; j++)
    407   {   
     395  {
    408396    /* E and O */
    409397    E[0] = src[0] + src[3];
     
    412400    O[1] = src[1] - src[2];
    413401
    414     dst[0] = (g_aiT4[0][0]*E[0] + g_aiT4[0][1]*E[1] + add)>>shift;
    415     dst[2*line] = (g_aiT4[2][0]*E[0] + g_aiT4[2][1]*E[1] + add)>>shift;
    416     dst[line] = (g_aiT4[1][0]*O[0] + g_aiT4[1][1]*O[1] + add)>>shift;
    417     dst[3*line] = (g_aiT4[3][0]*O[0] + g_aiT4[3][1]*O[1] + add)>>shift;
     402    dst[0]      = (g_aiT4[TRANSFORM_FORWARD][0][0]*E[0] + g_aiT4[TRANSFORM_FORWARD][0][1]*E[1] + add)>>shift;
     403    dst[2*line] = (g_aiT4[TRANSFORM_FORWARD][2][0]*E[0] + g_aiT4[TRANSFORM_FORWARD][2][1]*E[1] + add)>>shift;
     404    dst[line]   = (g_aiT4[TRANSFORM_FORWARD][1][0]*O[0] + g_aiT4[TRANSFORM_FORWARD][1][1]*O[1] + add)>>shift;
     405    dst[3*line] = (g_aiT4[TRANSFORM_FORWARD][3][0]*O[0] + g_aiT4[TRANSFORM_FORWARD][3][1]*O[1] + add)>>shift;
    418406
    419407    src += 4;
     
    422410}
    423411
    424 // Fast DST Algorithm. Full matrix multiplication for DST and Fast DST algorithm 
     412// Fast DST Algorithm. Full matrix multiplication for DST and Fast DST algorithm
    425413// give identical results
    426 void fastForwardDst(Short *block,Short *coeff,Int shift)  // input block, output coeff
    427 {
    428   Int i, c[4];
    429   Int rnd_factor = 1<<(shift-1);
     414Void fastForwardDst(TCoeff *block, TCoeff *coeff, Int shift)  // input block, output coeff
     415{
     416  Int i;
     417  TCoeff c[4];
     418  TCoeff rnd_factor = (shift > 0) ? (1<<(shift-1)) : 0;
    430419  for (i=0; i<4; i++)
    431420  {
    432421    // Intermediate Variables
    433     c[0] = block[4*i+0] + block[4*i+3];
    434     c[1] = block[4*i+1] + block[4*i+3];
    435     c[2] = block[4*i+0] - block[4*i+1];
    436     c[3] = 74* block[4*i+2];
    437 
    438     coeff[   i] =  ( 29 * c[0] + 55 * c[1]         + c[3]               + rnd_factor ) >> shift;
    439     coeff[ 4+i] =  ( 74 * (block[4*i+0]+ block[4*i+1] - block[4*i+3])   + rnd_factor ) >> shift;
    440     coeff[ 8+i] =  ( 29 * c[2] + 55 * c[0]         - c[3]               + rnd_factor ) >> shift;
    441     coeff[12+i] =  ( 55 * c[2] - 29 * c[1]         + c[3]               + rnd_factor ) >> shift;
    442   }
    443 }
    444 
    445 void fastInverseDst(Short *tmp,Short *block,Int shift)  // input tmp, output block
    446 {
    447   Int i, c[4];
    448   Int rnd_factor = 1<<(shift-1);
     422    c[0] = block[4*i+0];
     423    c[1] = block[4*i+1];
     424    c[2] = block[4*i+2];
     425    c[3] = block[4*i+3];
     426
     427    for (Int row = 0; row < 4; row++)
     428    {
     429      TCoeff result = 0;
     430      for (Int column = 0; column < 4; column++)
     431      {
     432        result += c[column] * g_as_DST_MAT_4[TRANSFORM_FORWARD][row][column]; // use the defined matrix, rather than hard-wired numbers
     433      }
     434
     435      coeff[(row * 4) + i] = rightShift((result + rnd_factor), shift);
     436    }
     437  }
     438}
     439
     440Void fastInverseDst(TCoeff *tmp, TCoeff *block, Int shift, const TCoeff outputMinimum, const TCoeff outputMaximum)  // input tmp, output block
     441{
     442  Int i;
     443  TCoeff c[4];
     444  TCoeff rnd_factor = (shift > 0) ? (1<<(shift-1)) : 0;
    449445  for (i=0; i<4; i++)
    450   { 
     446  {
    451447    // Intermediate Variables
    452     c[0] = tmp[  i] + tmp[ 8+i];
    453     c[1] = tmp[8+i] + tmp[12+i];
    454     c[2] = tmp[  i] - tmp[12+i];
    455     c[3] = 74* tmp[4+i];
    456 
    457     block[4*i+0] = Clip3( -32768, 32767, ( 29 * c[0] + 55 * c[1]     + c[3]               + rnd_factor ) >> shift );
    458     block[4*i+1] = Clip3( -32768, 32767, ( 55 * c[2] - 29 * c[1]     + c[3]               + rnd_factor ) >> shift );
    459     block[4*i+2] = Clip3( -32768, 32767, ( 74 * (tmp[i] - tmp[8+i]  + tmp[12+i])      + rnd_factor ) >> shift );
    460     block[4*i+3] = Clip3( -32768, 32767, ( 55 * c[0] + 29 * c[2]     - c[3]               + rnd_factor ) >> shift );
    461   }
    462 }
    463 
    464 void partialButterflyInverse4(Short *src,Short *dst,Int shift, Int line)
     448    c[0] = tmp[   i];
     449    c[1] = tmp[4 +i];
     450    c[2] = tmp[8 +i];
     451    c[3] = tmp[12+i];
     452
     453    for (Int column = 0; column < 4; column++)
     454    {
     455      TCoeff &result = block[(i * 4) + column];
     456
     457      result = 0;
     458      for (Int row = 0; row < 4; row++)
     459      {
     460        result += c[row] * g_as_DST_MAT_4[TRANSFORM_INVERSE][row][column]; // use the defined matrix, rather than hard-wired numbers
     461      }
     462
     463      result = Clip3( outputMinimum, outputMaximum, rightShift((result + rnd_factor), shift));
     464    }
     465  }
     466}
     467
     468/** 4x4 inverse transform implemented using partial butterfly structure (1D)
     469 *  \param src   input data (transform coefficients)
     470 *  \param dst   output data (residual)
     471 *  \param shift specifies right shift after 1D transform
     472 *  \param line  number of 1D transforms to perform (iteration count of the outer loop)
     473 *  \param outputMinimum  minimum for clipping
     474 *  \param outputMaximum  maximum for clipping
     475 */
     476Void partialButterflyInverse4(TCoeff *src, TCoeff *dst, Int shift, Int line, const TCoeff outputMinimum, const TCoeff outputMaximum)
    465477{
    466478  Int j;
    467   Int E[2],O[2];
    468   Int add = 1<<(shift-1);
     479  TCoeff E[2],O[2];
     480  TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
    469481
    470482  for (j=0; j<line; j++)
    471   {   
    472     /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */   
    473     O[0] = g_aiT4[1][0]*src[line] + g_aiT4[3][0]*src[3*line];
    474     O[1] = g_aiT4[1][1]*src[line] + g_aiT4[3][1]*src[3*line];
    475     E[0] = g_aiT4[0][0]*src[0] + g_aiT4[2][0]*src[2*line];
    476     E[1] = g_aiT4[0][1]*src[0] + g_aiT4[2][1]*src[2*line];
     483  {
     484    /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
     485    O[0] = g_aiT4[TRANSFORM_INVERSE][1][0]*src[line] + g_aiT4[TRANSFORM_INVERSE][3][0]*src[3*line];
     486    O[1] = g_aiT4[TRANSFORM_INVERSE][1][1]*src[line] + g_aiT4[TRANSFORM_INVERSE][3][1]*src[3*line];
     487    E[0] = g_aiT4[TRANSFORM_INVERSE][0][0]*src[0]    + g_aiT4[TRANSFORM_INVERSE][2][0]*src[2*line];
     488    E[1] = g_aiT4[TRANSFORM_INVERSE][0][1]*src[0]    + g_aiT4[TRANSFORM_INVERSE][2][1]*src[2*line];
    477489
    478490    /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
    479     dst[0] = Clip3( -32768, 32767, (E[0] + O[0] + add)>>shift );
    480     dst[1] = Clip3( -32768, 32767, (E[1] + O[1] + add)>>shift );
    481     dst[2] = Clip3( -32768, 32767, (E[1] - O[1] + add)>>shift );
    482     dst[3] = Clip3( -32768, 32767, (E[0] - O[0] + add)>>shift );
    483            
     491    dst[0] = Clip3( outputMinimum, outputMaximum, (E[0] + O[0] + add)>>shift );
     492    dst[1] = Clip3( outputMinimum, outputMaximum, (E[1] + O[1] + add)>>shift );
     493    dst[2] = Clip3( outputMinimum, outputMaximum, (E[1] - O[1] + add)>>shift );
     494    dst[3] = Clip3( outputMinimum, outputMaximum, (E[0] - O[0] + add)>>shift );
     495
    484496    src   ++;
    485497    dst += 4;
     
    487499}
    488500
    489 
    490 void partialButterfly8(Short *src,Short *dst,Int shift, Int line)
     501/** 8x8 forward transform implemented using partial butterfly structure (1D)
     502 *  \param src   input data (residual)
     503 *  \param dst   output data (transform coefficients)
     504 *  \param shift specifies right shift after 1D transform
     505 *  \param line  number of 1D transforms to perform (iteration count of the outer loop)
     506 */
     507Void partialButterfly8(TCoeff *src, TCoeff *dst, Int shift, Int line)
    491508{
    492509  Int j,k;
    493   Int E[4],O[4];
    494   Int EE[2],EO[2];
    495   Int add = 1<<(shift-1);
     510  TCoeff E[4],O[4];
     511  TCoeff EE[2],EO[2];
     512  TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
    496513
    497514  for (j=0; j<line; j++)
    498   { 
     515  {
    499516    /* E and O*/
    500517    for (k=0;k<4;k++)
     
    502519      E[k] = src[k] + src[7-k];
    503520      O[k] = src[k] - src[7-k];
    504     }   
     521    }
    505522    /* EE and EO */
    506     EE[0] = E[0] + E[3];   
     523    EE[0] = E[0] + E[3];
    507524    EO[0] = E[0] - E[3];
    508525    EE[1] = E[1] + E[2];
    509526    EO[1] = E[1] - E[2];
    510527
    511     dst[0] = (g_aiT8[0][0]*EE[0] + g_aiT8[0][1]*EE[1] + add)>>shift;
    512     dst[4*line] = (g_aiT8[4][0]*EE[0] + g_aiT8[4][1]*EE[1] + add)>>shift;
    513     dst[2*line] = (g_aiT8[2][0]*EO[0] + g_aiT8[2][1]*EO[1] + add)>>shift;
    514     dst[6*line] = (g_aiT8[6][0]*EO[0] + g_aiT8[6][1]*EO[1] + add)>>shift;
    515 
    516     dst[line] = (g_aiT8[1][0]*O[0] + g_aiT8[1][1]*O[1] + g_aiT8[1][2]*O[2] + g_aiT8[1][3]*O[3] + add)>>shift;
    517     dst[3*line] = (g_aiT8[3][0]*O[0] + g_aiT8[3][1]*O[1] + g_aiT8[3][2]*O[2] + g_aiT8[3][3]*O[3] + add)>>shift;
    518     dst[5*line] = (g_aiT8[5][0]*O[0] + g_aiT8[5][1]*O[1] + g_aiT8[5][2]*O[2] + g_aiT8[5][3]*O[3] + add)>>shift;
    519     dst[7*line] = (g_aiT8[7][0]*O[0] + g_aiT8[7][1]*O[1] + g_aiT8[7][2]*O[2] + g_aiT8[7][3]*O[3] + add)>>shift;
     528    dst[0]      = (g_aiT8[TRANSFORM_FORWARD][0][0]*EE[0] + g_aiT8[TRANSFORM_FORWARD][0][1]*EE[1] + add)>>shift;
     529    dst[4*line] = (g_aiT8[TRANSFORM_FORWARD][4][0]*EE[0] + g_aiT8[TRANSFORM_FORWARD][4][1]*EE[1] + add)>>shift;
     530    dst[2*line] = (g_aiT8[TRANSFORM_FORWARD][2][0]*EO[0] + g_aiT8[TRANSFORM_FORWARD][2][1]*EO[1] + add)>>shift;
     531    dst[6*line] = (g_aiT8[TRANSFORM_FORWARD][6][0]*EO[0] + g_aiT8[TRANSFORM_FORWARD][6][1]*EO[1] + add)>>shift;
     532
     533    dst[line]   = (g_aiT8[TRANSFORM_FORWARD][1][0]*O[0] + g_aiT8[TRANSFORM_FORWARD][1][1]*O[1] + g_aiT8[TRANSFORM_FORWARD][1][2]*O[2] + g_aiT8[TRANSFORM_FORWARD][1][3]*O[3] + add)>>shift;
     534    dst[3*line] = (g_aiT8[TRANSFORM_FORWARD][3][0]*O[0] + g_aiT8[TRANSFORM_FORWARD][3][1]*O[1] + g_aiT8[TRANSFORM_FORWARD][3][2]*O[2] + g_aiT8[TRANSFORM_FORWARD][3][3]*O[3] + add)>>shift;
     535    dst[5*line] = (g_aiT8[TRANSFORM_FORWARD][5][0]*O[0] + g_aiT8[TRANSFORM_FORWARD][5][1]*O[1] + g_aiT8[TRANSFORM_FORWARD][5][2]*O[2] + g_aiT8[TRANSFORM_FORWARD][5][3]*O[3] + add)>>shift;
     536    dst[7*line] = (g_aiT8[TRANSFORM_FORWARD][7][0]*O[0] + g_aiT8[TRANSFORM_FORWARD][7][1]*O[1] + g_aiT8[TRANSFORM_FORWARD][7][2]*O[2] + g_aiT8[TRANSFORM_FORWARD][7][3]*O[3] + add)>>shift;
    520537
    521538    src += 8;
     
    524541}
    525542
    526 
    527 void partialButterflyInverse8(Short *src,Short *dst,Int shift, Int line)
     543/** 8x8 inverse transform implemented using partial butterfly structure (1D)
     544 *  \param src   input data (transform coefficients)
     545 *  \param dst   output data (residual)
     546 *  \param shift specifies right shift after 1D transform
     547 *  \param line  number of 1D transforms to perform (iteration count of the outer loop)
     548 *  \param outputMinimum  minimum for clipping
     549 *  \param outputMaximum  maximum for clipping
     550 */
     551Void partialButterflyInverse8(TCoeff *src, TCoeff *dst, Int shift, Int line, const TCoeff outputMinimum, const TCoeff outputMaximum)
    528552{
    529553  Int j,k;
    530   Int E[4],O[4];
    531   Int EE[2],EO[2];
    532   Int add = 1<<(shift-1);
    533 
    534   for (j=0; j<line; j++) 
    535   {   
     554  TCoeff E[4],O[4];
     555  TCoeff EE[2],EO[2];
     556  TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
     557
     558  for (j=0; j<line; j++)
     559  {
    536560    /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
    537561    for (k=0;k<4;k++)
    538562    {
    539       O[k] = g_aiT8[ 1][k]*src[line] + g_aiT8[ 3][k]*src[3*line] + g_aiT8[ 5][k]*src[5*line] + g_aiT8[ 7][k]*src[7*line];
    540     }
    541 
    542     EO[0] = g_aiT8[2][0]*src[ 2*line ] + g_aiT8[6][0]*src[ 6*line ];
    543     EO[1] = g_aiT8[2][1]*src[ 2*line ] + g_aiT8[6][1]*src[ 6*line ];
    544     EE[0] = g_aiT8[0][0]*src[ 0      ] + g_aiT8[4][0]*src[ 4*line ];
    545     EE[1] = g_aiT8[0][1]*src[ 0      ] + g_aiT8[4][1]*src[ 4*line ];
    546 
    547     /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
     563      O[k] = g_aiT8[TRANSFORM_INVERSE][ 1][k]*src[line]   + g_aiT8[TRANSFORM_INVERSE][ 3][k]*src[3*line] +
     564             g_aiT8[TRANSFORM_INVERSE][ 5][k]*src[5*line] + g_aiT8[TRANSFORM_INVERSE][ 7][k]*src[7*line];
     565    }
     566
     567    EO[0] = g_aiT8[TRANSFORM_INVERSE][2][0]*src[ 2*line ] + g_aiT8[TRANSFORM_INVERSE][6][0]*src[ 6*line ];
     568    EO[1] = g_aiT8[TRANSFORM_INVERSE][2][1]*src[ 2*line ] + g_aiT8[TRANSFORM_INVERSE][6][1]*src[ 6*line ];
     569    EE[0] = g_aiT8[TRANSFORM_INVERSE][0][0]*src[ 0      ] + g_aiT8[TRANSFORM_INVERSE][4][0]*src[ 4*line ];
     570    EE[1] = g_aiT8[TRANSFORM_INVERSE][0][1]*src[ 0      ] + g_aiT8[TRANSFORM_INVERSE][4][1]*src[ 4*line ];
     571
     572    /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
    548573    E[0] = EE[0] + EO[0];
    549574    E[3] = EE[0] - EO[0];
     
    552577    for (k=0;k<4;k++)
    553578    {
    554       dst[ k   ] = Clip3( -32768, 32767, (E[k] + O[k] + add)>>shift );
    555       dst[ k+4 ] = Clip3( -32768, 32767, (E[3-k] - O[3-k] + add)>>shift );
    556     }   
     579      dst[ k   ] = Clip3( outputMinimum, outputMaximum, (E[k] + O[k] + add)>>shift );
     580      dst[ k+4 ] = Clip3( outputMinimum, outputMaximum, (E[3-k] - O[3-k] + add)>>shift );
     581    }
    557582    src ++;
    558583    dst += 8;
     
    560585}
    561586
    562 
    563 void partialButterfly16(Short *src,Short *dst,Int shift, Int line)
     587/** 16x16 forward transform implemented using partial butterfly structure (1D)
     588 *  \param src   input data (residual)
     589 *  \param dst   output data (transform coefficients)
     590 *  \param shift specifies right shift after 1D transform
     591 *  \param line  number of 1D transforms to perform (iteration count of the outer loop)
     592 */
     593Void partialButterfly16(TCoeff *src, TCoeff *dst, Int shift, Int line)
    564594{
    565595  Int j,k;
    566   Int E[8],O[8];
    567   Int EE[4],EO[4];
    568   Int EEE[2],EEO[2];
    569   Int add = 1<<(shift-1);
    570 
    571   for (j=0; j<line; j++) 
    572   {   
     596  TCoeff E[8],O[8];
     597  TCoeff EE[4],EO[4];
     598  TCoeff EEE[2],EEO[2];
     599  TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
     600
     601  for (j=0; j<line; j++)
     602  {
    573603    /* E and O*/
    574604    for (k=0;k<8;k++)
     
    576606      E[k] = src[k] + src[15-k];
    577607      O[k] = src[k] - src[15-k];
    578     } 
     608    }
    579609    /* EE and EO */
    580610    for (k=0;k<4;k++)
     
    584614    }
    585615    /* EEE and EEO */
    586     EEE[0] = EE[0] + EE[3];   
     616    EEE[0] = EE[0] + EE[3];
    587617    EEO[0] = EE[0] - EE[3];
    588618    EEE[1] = EE[1] + EE[2];
    589619    EEO[1] = EE[1] - EE[2];
    590620
    591     dst[ 0      ] = (g_aiT16[ 0][0]*EEE[0] + g_aiT16[ 0][1]*EEE[1] + add)>>shift;       
    592     dst[ 8*line ] = (g_aiT16[ 8][0]*EEE[0] + g_aiT16[ 8][1]*EEE[1] + add)>>shift;   
    593     dst[ 4*line ] = (g_aiT16[ 4][0]*EEO[0] + g_aiT16[ 4][1]*EEO[1] + add)>>shift;       
    594     dst[ 12*line] = (g_aiT16[12][0]*EEO[0] + g_aiT16[12][1]*EEO[1] + add)>>shift;
     621    dst[ 0      ] = (g_aiT16[TRANSFORM_FORWARD][ 0][0]*EEE[0] + g_aiT16[TRANSFORM_FORWARD][ 0][1]*EEE[1] + add)>>shift;
     622    dst[ 8*line ] = (g_aiT16[TRANSFORM_FORWARD][ 8][0]*EEE[0] + g_aiT16[TRANSFORM_FORWARD][ 8][1]*EEE[1] + add)>>shift;
     623    dst[ 4*line ] = (g_aiT16[TRANSFORM_FORWARD][ 4][0]*EEO[0] + g_aiT16[TRANSFORM_FORWARD][ 4][1]*EEO[1] + add)>>shift;
     624    dst[ 12*line] = (g_aiT16[TRANSFORM_FORWARD][12][0]*EEO[0] + g_aiT16[TRANSFORM_FORWARD][12][1]*EEO[1] + add)>>shift;
    595625
    596626    for (k=2;k<16;k+=4)
    597627    {
    598       dst[ k*line ] = (g_aiT16[k][0]*EO[0] + g_aiT16[k][1]*EO[1] + g_aiT16[k][2]*EO[2] + g_aiT16[k][3]*EO[3] + add)>>shift;     
     628      dst[ k*line ] = (g_aiT16[TRANSFORM_FORWARD][k][0]*EO[0] + g_aiT16[TRANSFORM_FORWARD][k][1]*EO[1] +
     629                       g_aiT16[TRANSFORM_FORWARD][k][2]*EO[2] + g_aiT16[TRANSFORM_FORWARD][k][3]*EO[3] + add)>>shift;
    599630    }
    600631
    601632    for (k=1;k<16;k+=2)
    602633    {
    603       dst[ k*line ] = (g_aiT16[k][0]*O[0] + g_aiT16[k][1]*O[1] + g_aiT16[k][2]*O[2] + g_aiT16[k][3]*O[3] +
    604         g_aiT16[k][4]*O[4] + g_aiT16[k][5]*O[5] + g_aiT16[k][6]*O[6] + g_aiT16[k][7]*O[7] + add)>>shift;
     634      dst[ k*line ] = (g_aiT16[TRANSFORM_FORWARD][k][0]*O[0] + g_aiT16[TRANSFORM_FORWARD][k][1]*O[1] +
     635                       g_aiT16[TRANSFORM_FORWARD][k][2]*O[2] + g_aiT16[TRANSFORM_FORWARD][k][3]*O[3] +
     636                       g_aiT16[TRANSFORM_FORWARD][k][4]*O[4] + g_aiT16[TRANSFORM_FORWARD][k][5]*O[5] +
     637                       g_aiT16[TRANSFORM_FORWARD][k][6]*O[6] + g_aiT16[TRANSFORM_FORWARD][k][7]*O[7] + add)>>shift;
    605638    }
    606639
    607640    src += 16;
    608     dst ++;
    609 
    610   }
    611 }
    612 
    613 
    614 void partialButterflyInverse16(Short *src,Short *dst,Int shift, Int line)
     641    dst ++;
     642
     643  }
     644}
     645
     646/** 16x16 inverse transform implemented using partial butterfly structure (1D)
     647 *  \param src            input data (transform coefficients)
     648 *  \param dst            output data (residual)
     649 *  \param shift          specifies right shift after 1D transform
     650 *  \param line           number of 1D transforms to perform (iteration count of the outer loop)
     651 *  \param outputMinimum  minimum for clipping
     652 *  \param outputMaximum  maximum for clipping
     653 */
     654Void partialButterflyInverse16(TCoeff *src, TCoeff *dst, Int shift, Int line, const TCoeff outputMinimum, const TCoeff outputMaximum)
    615655{
    616656  Int j,k;
    617   Int E[8],O[8];
    618   Int EE[4],EO[4];
    619   Int EEE[2],EEO[2];
    620   Int add = 1<<(shift-1);
     657  TCoeff E[8],O[8];
     658  TCoeff EE[4],EO[4];
     659  TCoeff EEE[2],EEO[2];
     660  TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
    621661
    622662  for (j=0; j<line; j++)
    623   {   
     663  {
    624664    /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
    625665    for (k=0;k<8;k++)
    626666    {
    627       O[k] = g_aiT16[ 1][k]*src[ line] + g_aiT16[ 3][k]*src[ 3*line] + g_aiT16[ 5][k]*src[ 5*line] + g_aiT16[ 7][k]*src[ 7*line] +
    628         g_aiT16[ 9][k]*src[ 9*line] + g_aiT16[11][k]*src[11*line] + g_aiT16[13][k]*src[13*line] + g_aiT16[15][k]*src[15*line];
     667      O[k] = g_aiT16[TRANSFORM_INVERSE][ 1][k]*src[ line]   + g_aiT16[TRANSFORM_INVERSE][ 3][k]*src[ 3*line] +
     668             g_aiT16[TRANSFORM_INVERSE][ 5][k]*src[ 5*line] + g_aiT16[TRANSFORM_INVERSE][ 7][k]*src[ 7*line] +
     669             g_aiT16[TRANSFORM_INVERSE][ 9][k]*src[ 9*line] + g_aiT16[TRANSFORM_INVERSE][11][k]*src[11*line] +
     670             g_aiT16[TRANSFORM_INVERSE][13][k]*src[13*line] + g_aiT16[TRANSFORM_INVERSE][15][k]*src[15*line];
    629671    }
    630672    for (k=0;k<4;k++)
    631673    {
    632       EO[k] = g_aiT16[ 2][k]*src[ 2*line] + g_aiT16[ 6][k]*src[ 6*line] + g_aiT16[10][k]*src[10*line] + g_aiT16[14][k]*src[14*line];
    633     }
    634     EEO[0] = g_aiT16[4][0]*src[ 4*line ] + g_aiT16[12][0]*src[ 12*line ];
    635     EEE[0] = g_aiT16[0][0]*src[ 0      ] + g_aiT16[ 8][0]*src[ 8*line  ];
    636     EEO[1] = g_aiT16[4][1]*src[ 4*line ] + g_aiT16[12][1]*src[ 12*line ];
    637     EEE[1] = g_aiT16[0][1]*src[ 0      ] + g_aiT16[ 8][1]*src[ 8*line  ];
    638 
    639     /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
     674      EO[k] = g_aiT16[TRANSFORM_INVERSE][ 2][k]*src[ 2*line] + g_aiT16[TRANSFORM_INVERSE][ 6][k]*src[ 6*line] +
     675              g_aiT16[TRANSFORM_INVERSE][10][k]*src[10*line] + g_aiT16[TRANSFORM_INVERSE][14][k]*src[14*line];
     676    }
     677    EEO[0] = g_aiT16[TRANSFORM_INVERSE][4][0]*src[ 4*line ] + g_aiT16[TRANSFORM_INVERSE][12][0]*src[ 12*line ];
     678    EEE[0] = g_aiT16[TRANSFORM_INVERSE][0][0]*src[ 0      ] + g_aiT16[TRANSFORM_INVERSE][ 8][0]*src[ 8*line  ];
     679    EEO[1] = g_aiT16[TRANSFORM_INVERSE][4][1]*src[ 4*line ] + g_aiT16[TRANSFORM_INVERSE][12][1]*src[ 12*line ];
     680    EEE[1] = g_aiT16[TRANSFORM_INVERSE][0][1]*src[ 0      ] + g_aiT16[TRANSFORM_INVERSE][ 8][1]*src[ 8*line  ];
     681
     682    /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
    640683    for (k=0;k<2;k++)
    641684    {
    642685      EE[k] = EEE[k] + EEO[k];
    643686      EE[k+2] = EEE[1-k] - EEO[1-k];
    644     }   
     687    }
    645688    for (k=0;k<4;k++)
    646689    {
    647690      E[k] = EE[k] + EO[k];
    648691      E[k+4] = EE[3-k] - EO[3-k];
    649     }   
     692    }
    650693    for (k=0;k<8;k++)
    651694    {
    652       dst[k]   = Clip3( -32768, 32767, (E[k] + O[k] + add)>>shift );
    653       dst[k+8] = Clip3( -32768, 32767, (E[7-k] - O[7-k] + add)>>shift );
    654     }   
    655     src ++; 
     695      dst[k]   = Clip3( outputMinimum, outputMaximum, (E[k] + O[k] + add)>>shift );
     696      dst[k+8] = Clip3( outputMinimum, outputMaximum, (E[7-k] - O[7-k] + add)>>shift );
     697    }
     698    src ++;
    656699    dst += 16;
    657700  }
    658701}
    659702
    660 
    661 void partialButterfly32(Short *src,Short *dst,Int shift, Int line)
     703/** 32x32 forward transform implemented using partial butterfly structure (1D)
     704 *  \param src   input data (residual)
     705 *  \param dst   output data (transform coefficients)
     706 *  \param shift specifies right shift after 1D transform
     707 *  \param line
     708 */
     709Void partialButterfly32(TCoeff *src, TCoeff *dst, Int shift, Int line)
    662710{
    663711  Int j,k;
    664   Int E[16],O[16];
    665   Int EE[8],EO[8];
    666   Int EEE[4],EEO[4];
    667   Int EEEE[2],EEEO[2];
    668   Int add = 1<<(shift-1);
     712  TCoeff E[16],O[16];
     713  TCoeff EE[8],EO[8];
     714  TCoeff EEE[4],EEO[4];
     715  TCoeff EEEE[2],EEEO[2];
     716  TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
    669717
    670718  for (j=0; j<line; j++)
    671   {   
     719  {
    672720    /* E and O*/
    673721    for (k=0;k<16;k++)
     
    675723      E[k] = src[k] + src[31-k];
    676724      O[k] = src[k] - src[31-k];
    677     } 
     725    }
    678726    /* EE and EO */
    679727    for (k=0;k<8;k++)
     
    689737    }
    690738    /* EEEE and EEEO */
    691     EEEE[0] = EEE[0] + EEE[3];   
     739    EEEE[0] = EEE[0] + EEE[3];
    692740    EEEO[0] = EEE[0] - EEE[3];
    693741    EEEE[1] = EEE[1] + EEE[2];
    694742    EEEO[1] = EEE[1] - EEE[2];
    695743
    696     dst[ 0       ] = (g_aiT32[ 0][0]*EEEE[0] + g_aiT32[ 0][1]*EEEE[1] + add)>>shift;
    697     dst[ 16*line ] = (g_aiT32[16][0]*EEEE[0] + g_aiT32[16][1]*EEEE[1] + add)>>shift;
    698     dst[ 8*line  ] = (g_aiT32[ 8][0]*EEEO[0] + g_aiT32[ 8][1]*EEEO[1] + add)>>shift;
    699     dst[ 24*line ] = (g_aiT32[24][0]*EEEO[0] + g_aiT32[24][1]*EEEO[1] + add)>>shift;
     744    dst[ 0       ] = (g_aiT32[TRANSFORM_FORWARD][ 0][0]*EEEE[0] + g_aiT32[TRANSFORM_FORWARD][ 0][1]*EEEE[1] + add)>>shift;
     745    dst[ 16*line ] = (g_aiT32[TRANSFORM_FORWARD][16][0]*EEEE[0] + g_aiT32[TRANSFORM_FORWARD][16][1]*EEEE[1] + add)>>shift;
     746    dst[ 8*line  ] = (g_aiT32[TRANSFORM_FORWARD][ 8][0]*EEEO[0] + g_aiT32[TRANSFORM_FORWARD][ 8][1]*EEEO[1] + add)>>shift;
     747    dst[ 24*line ] = (g_aiT32[TRANSFORM_FORWARD][24][0]*EEEO[0] + g_aiT32[TRANSFORM_FORWARD][24][1]*EEEO[1] + add)>>shift;
    700748    for (k=4;k<32;k+=8)
    701749    {
    702       dst[ k*line ] = (g_aiT32[k][0]*EEO[0] + g_aiT32[k][1]*EEO[1] + g_aiT32[k][2]*EEO[2] + g_aiT32[k][3]*EEO[3] + add)>>shift;
    703     }       
     750      dst[ k*line ] = (g_aiT32[TRANSFORM_FORWARD][k][0]*EEO[0] + g_aiT32[TRANSFORM_FORWARD][k][1]*EEO[1] +
     751                       g_aiT32[TRANSFORM_FORWARD][k][2]*EEO[2] + g_aiT32[TRANSFORM_FORWARD][k][3]*EEO[3] + add)>>shift;
     752    }
    704753    for (k=2;k<32;k+=4)
    705754    {
    706       dst[ k*line ] = (g_aiT32[k][0]*EO[0] + g_aiT32[k][1]*EO[1] + g_aiT32[k][2]*EO[2] + g_aiT32[k][3]*EO[3] +
    707         g_aiT32[k][4]*EO[4] + g_aiT32[k][5]*EO[5] + g_aiT32[k][6]*EO[6] + g_aiT32[k][7]*EO[7] + add)>>shift;
    708     }       
     755      dst[ k*line ] = (g_aiT32[TRANSFORM_FORWARD][k][0]*EO[0] + g_aiT32[TRANSFORM_FORWARD][k][1]*EO[1] +
     756                       g_aiT32[TRANSFORM_FORWARD][k][2]*EO[2] + g_aiT32[TRANSFORM_FORWARD][k][3]*EO[3] +
     757                       g_aiT32[TRANSFORM_FORWARD][k][4]*EO[4] + g_aiT32[TRANSFORM_FORWARD][k][5]*EO[5] +
     758                       g_aiT32[TRANSFORM_FORWARD][k][6]*EO[6] + g_aiT32[TRANSFORM_FORWARD][k][7]*EO[7] + add)>>shift;
     759    }
    709760    for (k=1;k<32;k+=2)
    710761    {
    711       dst[ k*line ] = (g_aiT32[k][ 0]*O[ 0] + g_aiT32[k][ 1]*O[ 1] + g_aiT32[k][ 2]*O[ 2] + g_aiT32[k][ 3]*O[ 3] +
    712         g_aiT32[k][ 4]*O[ 4] + g_aiT32[k][ 5]*O[ 5] + g_aiT32[k][ 6]*O[ 6] + g_aiT32[k][ 7]*O[ 7] +
    713         g_aiT32[k][ 8]*O[ 8] + g_aiT32[k][ 9]*O[ 9] + g_aiT32[k][10]*O[10] + g_aiT32[k][11]*O[11] +
    714         g_aiT32[k][12]*O[12] + g_aiT32[k][13]*O[13] + g_aiT32[k][14]*O[14] + g_aiT32[k][15]*O[15] + add)>>shift;
    715     }
     762      dst[ k*line ] = (g_aiT32[TRANSFORM_FORWARD][k][ 0]*O[ 0] + g_aiT32[TRANSFORM_FORWARD][k][ 1]*O[ 1] +
     763                       g_aiT32[TRANSFORM_FORWARD][k][ 2]*O[ 2] + g_aiT32[TRANSFORM_FORWARD][k][ 3]*O[ 3] +
     764                       g_aiT32[TRANSFORM_FORWARD][k][ 4]*O[ 4] + g_aiT32[TRANSFORM_FORWARD][k][ 5]*O[ 5] +
     765                       g_aiT32[TRANSFORM_FORWARD][k][ 6]*O[ 6] + g_aiT32[TRANSFORM_FORWARD][k][ 7]*O[ 7] +
     766                       g_aiT32[TRANSFORM_FORWARD][k][ 8]*O[ 8] + g_aiT32[TRANSFORM_FORWARD][k][ 9]*O[ 9] +
     767                       g_aiT32[TRANSFORM_FORWARD][k][10]*O[10] + g_aiT32[TRANSFORM_FORWARD][k][11]*O[11] +
     768                       g_aiT32[TRANSFORM_FORWARD][k][12]*O[12] + g_aiT32[TRANSFORM_FORWARD][k][13]*O[13] +
     769                       g_aiT32[TRANSFORM_FORWARD][k][14]*O[14] + g_aiT32[TRANSFORM_FORWARD][k][15]*O[15] + add)>>shift;
     770    }
     771
    716772    src += 32;
    717773    dst ++;
     
    719775}
    720776
    721 
    722 void partialButterflyInverse32(Short *src,Short *dst,Int shift, Int line)
     777/** 32x32 inverse transform implemented using partial butterfly structure (1D)
     778 *  \param src   input data (transform coefficients)
     779 *  \param dst   output data (residual)
     780 *  \param shift specifies right shift after 1D transform
     781 *  \param line
     782 *  \param outputMinimum  minimum for clipping
     783 *  \param outputMaximum  maximum for clipping
     784 */
     785Void partialButterflyInverse32(TCoeff *src, TCoeff *dst, Int shift, Int line, const TCoeff outputMinimum, const TCoeff outputMaximum)
    723786{
    724787  Int j,k;
    725   Int E[16],O[16];
    726   Int EE[8],EO[8];
    727   Int EEE[4],EEO[4];
    728   Int EEEE[2],EEEO[2];
    729   Int add = 1<<(shift-1);
     788  TCoeff E[16],O[16];
     789  TCoeff EE[8],EO[8];
     790  TCoeff EEE[4],EEO[4];
     791  TCoeff EEEE[2],EEEO[2];
     792  TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
    730793
    731794  for (j=0; j<line; j++)
    732   {   
     795  {
    733796    /* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
    734797    for (k=0;k<16;k++)
    735798    {
    736       O[k] = g_aiT32[ 1][k]*src[ line  ] + g_aiT32[ 3][k]*src[ 3*line  ] + g_aiT32[ 5][k]*src[ 5*line  ] + g_aiT32[ 7][k]*src[ 7*line  ] +
    737         g_aiT32[ 9][k]*src[ 9*line  ] + g_aiT32[11][k]*src[ 11*line ] + g_aiT32[13][k]*src[ 13*line ] + g_aiT32[15][k]*src[ 15*line ] +
    738         g_aiT32[17][k]*src[ 17*line ] + g_aiT32[19][k]*src[ 19*line ] + g_aiT32[21][k]*src[ 21*line ] + g_aiT32[23][k]*src[ 23*line ] +
    739         g_aiT32[25][k]*src[ 25*line ] + g_aiT32[27][k]*src[ 27*line ] + g_aiT32[29][k]*src[ 29*line ] + g_aiT32[31][k]*src[ 31*line ];
     799      O[k] = g_aiT32[TRANSFORM_INVERSE][ 1][k]*src[ line    ] + g_aiT32[TRANSFORM_INVERSE][ 3][k]*src[ 3*line  ] +
     800             g_aiT32[TRANSFORM_INVERSE][ 5][k]*src[ 5*line  ] + g_aiT32[TRANSFORM_INVERSE][ 7][k]*src[ 7*line  ] +
     801             g_aiT32[TRANSFORM_INVERSE][ 9][k]*src[ 9*line  ] + g_aiT32[TRANSFORM_INVERSE][11][k]*src[ 11*line ] +
     802             g_aiT32[TRANSFORM_INVERSE][13][k]*src[ 13*line ] + g_aiT32[TRANSFORM_INVERSE][15][k]*src[ 15*line ] +
     803             g_aiT32[TRANSFORM_INVERSE][17][k]*src[ 17*line ] + g_aiT32[TRANSFORM_INVERSE][19][k]*src[ 19*line ] +
     804             g_aiT32[TRANSFORM_INVERSE][21][k]*src[ 21*line ] + g_aiT32[TRANSFORM_INVERSE][23][k]*src[ 23*line ] +
     805             g_aiT32[TRANSFORM_INVERSE][25][k]*src[ 25*line ] + g_aiT32[TRANSFORM_INVERSE][27][k]*src[ 27*line ] +
     806             g_aiT32[TRANSFORM_INVERSE][29][k]*src[ 29*line ] + g_aiT32[TRANSFORM_INVERSE][31][k]*src[ 31*line ];
    740807    }
    741808    for (k=0;k<8;k++)
    742809    {
    743       EO[k] = g_aiT32[ 2][k]*src[ 2*line  ] + g_aiT32[ 6][k]*src[ 6*line  ] + g_aiT32[10][k]*src[ 10*line ] + g_aiT32[14][k]*src[ 14*line ] +
    744         g_aiT32[18][k]*src[ 18*line ] + g_aiT32[22][k]*src[ 22*line ] + g_aiT32[26][k]*src[ 26*line ] + g_aiT32[30][k]*src[ 30*line ];
     810      EO[k] = g_aiT32[TRANSFORM_INVERSE][ 2][k]*src[ 2*line  ] + g_aiT32[TRANSFORM_INVERSE][ 6][k]*src[ 6*line  ] +
     811              g_aiT32[TRANSFORM_INVERSE][10][k]*src[ 10*line ] + g_aiT32[TRANSFORM_INVERSE][14][k]*src[ 14*line ] +
     812              g_aiT32[TRANSFORM_INVERSE][18][k]*src[ 18*line ] + g_aiT32[TRANSFORM_INVERSE][22][k]*src[ 22*line ] +
     813              g_aiT32[TRANSFORM_INVERSE][26][k]*src[ 26*line ] + g_aiT32[TRANSFORM_INVERSE][30][k]*src[ 30*line ];
    745814    }
    746815    for (k=0;k<4;k++)
    747816    {
    748       EEO[k] = g_aiT32[4][k]*src[ 4*line ] + g_aiT32[12][k]*src[ 12*line ] + g_aiT32[20][k]*src[ 20*line ] + g_aiT32[28][k]*src[ 28*line ];
    749     }
    750     EEEO[0] = g_aiT32[8][0]*src[ 8*line ] + g_aiT32[24][0]*src[ 24*line ];
    751     EEEO[1] = g_aiT32[8][1]*src[ 8*line ] + g_aiT32[24][1]*src[ 24*line ];
    752     EEEE[0] = g_aiT32[0][0]*src[ 0      ] + g_aiT32[16][0]*src[ 16*line ];   
    753     EEEE[1] = g_aiT32[0][1]*src[ 0      ] + g_aiT32[16][1]*src[ 16*line ];
     817      EEO[k] = g_aiT32[TRANSFORM_INVERSE][ 4][k]*src[  4*line ] + g_aiT32[TRANSFORM_INVERSE][12][k]*src[ 12*line ] +
     818               g_aiT32[TRANSFORM_INVERSE][20][k]*src[ 20*line ] + g_aiT32[TRANSFORM_INVERSE][28][k]*src[ 28*line ];
     819    }
     820    EEEO[0] = g_aiT32[TRANSFORM_INVERSE][8][0]*src[ 8*line ] + g_aiT32[TRANSFORM_INVERSE][24][0]*src[ 24*line ];
     821    EEEO[1] = g_aiT32[TRANSFORM_INVERSE][8][1]*src[ 8*line ] + g_aiT32[TRANSFORM_INVERSE][24][1]*src[ 24*line ];
     822    EEEE[0] = g_aiT32[TRANSFORM_INVERSE][0][0]*src[ 0      ] + g_aiT32[TRANSFORM_INVERSE][16][0]*src[ 16*line ];
     823    EEEE[1] = g_aiT32[TRANSFORM_INVERSE][0][1]*src[ 0      ] + g_aiT32[TRANSFORM_INVERSE][16][1]*src[ 16*line ];
    754824
    755825    /* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
     
    757827    EEE[3] = EEEE[0] - EEEO[0];
    758828    EEE[1] = EEEE[1] + EEEO[1];
    759     EEE[2] = EEEE[1] - EEEO[1];   
     829    EEE[2] = EEEE[1] - EEEO[1];
    760830    for (k=0;k<4;k++)
    761831    {
    762832      EE[k] = EEE[k] + EEO[k];
    763833      EE[k+4] = EEE[3-k] - EEO[3-k];
    764     }   
     834    }
    765835    for (k=0;k<8;k++)
    766836    {
    767837      E[k] = EE[k] + EO[k];
    768838      E[k+8] = EE[7-k] - EO[7-k];
    769     }   
     839    }
    770840    for (k=0;k<16;k++)
    771841    {
    772       dst[k]    = Clip3( -32768, 32767, (E[k] + O[k] + add)>>shift );
    773       dst[k+16] = Clip3( -32768, 32767, (E[15-k] - O[15-k] + add)>>shift );
     842      dst[k]    = Clip3( outputMinimum, outputMaximum, (E[k] + O[k] + add)>>shift );
     843      dst[k+16] = Clip3( outputMinimum, outputMaximum, (E[15-k] - O[15-k] + add)>>shift );
    774844    }
    775845    src ++;
     
    779849
    780850/** MxN forward transform (2D)
    781 *  \param block input data (residual)
    782 *  \param coeff output data (transform coefficients)
    783 *  \param iWidth input data (width of transform)
    784 *  \param iHeight input data (height of transform)
     851*  \param bitDepth              [in]  bit depth
     852*  \param block                 [in]  residual block
     853*  \param coeff                 [out] transform coefficients
     854*  \param iWidth                [in]  width of transform
     855*  \param iHeight               [in]  height of transform
     856*  \param useDST                [in]
     857*  \param maxLog2TrDynamicRange [in]
     858
    785859*/
    786 void xTrMxN(Int bitDepth, Short *block,Short *coeff, Int iWidth, Int iHeight, UInt uiMode)
    787 {
    788   Int shift_1st = g_aucConvertToBit[iWidth]  + 1 + bitDepth-8; // log2(iWidth) - 1 + g_bitDepth - 8
    789   Int shift_2nd = g_aucConvertToBit[iHeight]  + 8;                   // log2(iHeight) + 6
    790 
    791   Short tmp[ 64 * 64 ];
    792 
    793   if( iWidth == 4 && iHeight == 4)
    794   {
    795     if (uiMode != REG_DCT)
    796     {
    797       fastForwardDst(block,tmp,shift_1st); // Forward DST BY FAST ALGORITHM, block input, tmp output
    798       fastForwardDst(tmp,coeff,shift_2nd); // Forward DST BY FAST ALGORITHM, tmp input, coeff output
    799     }
    800     else
    801     {
    802       partialButterfly4(block, tmp, shift_1st, iHeight);
    803       partialButterfly4(tmp, coeff, shift_2nd, iWidth);
    804     }
    805 
    806   }
    807   else if( iWidth == 8 && iHeight == 8)
    808   {
    809     partialButterfly8( block, tmp, shift_1st, iHeight );
    810     partialButterfly8( tmp, coeff, shift_2nd, iWidth );
    811   }
    812   else if( iWidth == 16 && iHeight == 16)
    813   {
    814     partialButterfly16( block, tmp, shift_1st, iHeight );
    815     partialButterfly16( tmp, coeff, shift_2nd, iWidth );
    816   }
    817   else if( iWidth == 32 && iHeight == 32)
    818   {
    819     partialButterfly32( block, tmp, shift_1st, iHeight );
    820     partialButterfly32( tmp, coeff, shift_2nd, iWidth );
    821   }
    822 }
     860Void xTrMxN(Int bitDepth, TCoeff *block, TCoeff *coeff, Int iWidth, Int iHeight, Bool useDST, const Int maxLog2TrDynamicRange)
     861{
          // 2D forward transform done as two 1D passes through the scratch buffer tmp:
          //   pass 1: block -> tmp,   transform size iWidth,  iHeight lines, shift_1st
          //   pass 2: tmp   -> coeff, transform size iHeight, iWidth  lines, shift_2nd
          // Supported sizes in each dimension are 4, 8, 16 and 32; anything else asserts
          // and terminates the process via exit(1).
     862  const Int TRANSFORM_MATRIX_SHIFT = g_transformMatrixShift[TRANSFORM_FORWARD];
     863
          // NOTE(review): g_aucConvertToBit[n] + 2 is presumably log2(n); the table is
          // defined outside this view - confirm. Only the first-pass shift depends on
          // bitDepth and maxLog2TrDynamicRange.
     864  const Int shift_1st = ((g_aucConvertToBit[iWidth] + 2) +  bitDepth + TRANSFORM_MATRIX_SHIFT) - maxLog2TrDynamicRange;
     865  const Int shift_2nd = (g_aucConvertToBit[iHeight] + 2) + TRANSFORM_MATRIX_SHIFT;
     866
          // Negative shift counts are not supported by the 1D kernels called below.
     867  assert(shift_1st >= 0);
     868  assert(shift_2nd >= 0);
     869
     870  TCoeff tmp[ MAX_TU_SIZE * MAX_TU_SIZE ];
     871
          // First pass: the transform size is selected by the block width.
     872  switch (iWidth)
     873  {
     874    case 4:
     875      {
                // The DST replaces the 4-point butterfly only when the block is 4x4 and useDST is set.
     876        if ((iHeight == 4) && useDST)    // Check for DCT or DST
     877        {
     878           fastForwardDst( block, tmp, shift_1st );
     879        }
     880        else
     881        {
     882          partialButterfly4 ( block, tmp, shift_1st, iHeight );
     883        }
     884      }
     885      break;
     886
     887    case 8:     partialButterfly8 ( block, tmp, shift_1st, iHeight );  break;
     888    case 16:    partialButterfly16( block, tmp, shift_1st, iHeight );  break;
     889    case 32:    partialButterfly32( block, tmp, shift_1st, iHeight );  break;
     890    default:
     891      assert(0); exit (1); break;
     892  }
     893
          // Second pass: the transform size is selected by the block height.
     894  switch (iHeight)
     895  {
     896    case 4:
     897      {
     898        if ((iWidth == 4) && useDST)    // Check for DCT or DST
     899        {
     900          fastForwardDst( tmp, coeff, shift_2nd );
     901        }
     902        else
     903        {
     904          partialButterfly4 ( tmp, coeff, shift_2nd, iWidth );
     905        }
     906      }
     907      break;
     908
     909    case 8:     partialButterfly8 ( tmp, coeff, shift_2nd, iWidth );    break;
     910    case 16:    partialButterfly16( tmp, coeff, shift_2nd, iWidth );    break;
     911    case 32:    partialButterfly32( tmp, coeff, shift_2nd, iWidth );    break;
     912    default:
     913      assert(0); exit (1); break;
     914  }
     915}
     916
     917
    823918/** MxN inverse transform (2D)
    824 *  \param coeff input data (transform coefficients)
    825 *  \param block output data (residual)
    826 *  \param iWidth input data (width of transform)
    827 *  \param iHeight input data (height of transform)
     919*  \param bitDepth              [in]  bit depth
     920*  \param coeff                 [in]  transform coefficients
     921*  \param block                 [out] residual block
     922*  \param iWidth                [in]  width of transform
     923*  \param iHeight               [in]  height of transform
     924*  \param useDST                [in]
     925*  \param maxLog2TrDynamicRange [in]
    828926*/
    829 void xITrMxN(Int bitDepth, Short *coeff,Short *block, Int iWidth, Int iHeight, UInt uiMode)
    830 {
    831   Int shift_1st = SHIFT_INV_1ST;
    832   Int shift_2nd = SHIFT_INV_2ND - (bitDepth-8);
    833 
    834   Short tmp[ 64*64];
    835   if( iWidth == 4 && iHeight == 4)
    836   {
    837     if (uiMode != REG_DCT)
    838     {
    839       fastInverseDst(coeff,tmp,shift_1st);    // Inverse DST by FAST Algorithm, coeff input, tmp output
    840       fastInverseDst(tmp,block,shift_2nd); // Inverse DST by FAST Algorithm, tmp input, coeff output
    841     }
    842     else
    843     {
    844       partialButterflyInverse4(coeff,tmp,shift_1st,iWidth);
    845       partialButterflyInverse4(tmp,block,shift_2nd,iHeight);
    846     }
    847   }
    848   else if( iWidth == 8 && iHeight == 8)
    849   {
    850     partialButterflyInverse8(coeff,tmp,shift_1st,iWidth);
    851     partialButterflyInverse8(tmp,block,shift_2nd,iHeight);
    852   }
    853   else if( iWidth == 16 && iHeight == 16)
    854   {
    855     partialButterflyInverse16(coeff,tmp,shift_1st,iWidth);
    856     partialButterflyInverse16(tmp,block,shift_2nd,iHeight);
    857   }
    858   else if( iWidth == 32 && iHeight == 32)
    859   {
    860     partialButterflyInverse32(coeff,tmp,shift_1st,iWidth);
    861     partialButterflyInverse32(tmp,block,shift_2nd,iHeight);
    862   }
    863 }
    864 
    865 #endif //MATRIX_MULT
    866 
    867 // To minimize the distortion only. No rate is considered.
    868 Void TComTrQuant::signBitHidingHDQ( TCoeff* pQCoef, TCoeff* pCoef, UInt const *scan, Int* deltaU, Int width, Int height )
    869 {
     927Void xITrMxN(Int bitDepth, TCoeff *coeff, TCoeff *block, Int iWidth, Int iHeight, Bool useDST, const Int maxLog2TrDynamicRange)
     928{
          // 2D inverse transform done as two 1D passes through the scratch buffer tmp:
          //   pass 1: coeff -> tmp,   transform size iHeight, iWidth  lines, shift_1st
          //   pass 2: tmp   -> block, transform size iWidth,  iHeight lines, shift_2nd
          // Supported sizes in each dimension are 4, 8, 16 and 32; anything else asserts
          // and terminates the process via exit(1).
     929  const Int TRANSFORM_MATRIX_SHIFT = g_transformMatrixShift[TRANSFORM_INVERSE];
     930
          // Together the two shifts remove
          //   2*TRANSFORM_MATRIX_SHIFT + maxLog2TrDynamicRange - bitDepth
          // bits; one bit of that total is moved from the second pass to the first.
     931  Int shift_1st = TRANSFORM_MATRIX_SHIFT + 1; //1 has been added to shift_1st at the expense of shift_2nd
     932  Int shift_2nd = (TRANSFORM_MATRIX_SHIFT + maxLog2TrDynamicRange - 1) - bitDepth;
          // Clipping range for the intermediate (first-pass) results:
          // [-(2^maxLog2TrDynamicRange), 2^maxLog2TrDynamicRange - 1].
     933  const TCoeff clipMinimum = -(1 << maxLog2TrDynamicRange);
     934  const TCoeff clipMaximum =  (1 << maxLog2TrDynamicRange) - 1;
     935
     936  assert(shift_1st >= 0);
     937  assert(shift_2nd >= 0);
     938
     939  TCoeff tmp[MAX_TU_SIZE * MAX_TU_SIZE];
     940
          // First pass: the transform size is selected by the block height; results are
          // clipped to the intermediate range computed above.
     941  switch (iHeight)
     942  {
     943    case 4:
     944      {
                // The DST replaces the 4-point butterfly only when the block is 4x4 and useDST is set.
     945        if ((iWidth == 4) && useDST)    // Check for DCT or DST
     946        {
     947          fastInverseDst( coeff, tmp, shift_1st, clipMinimum, clipMaximum);
     948        }
     949        else
     950        {
     951          partialButterflyInverse4 ( coeff, tmp, shift_1st, iWidth, clipMinimum, clipMaximum);
     952        }
     953      }
     954      break;
     955
     956    case  8: partialButterflyInverse8 ( coeff, tmp, shift_1st, iWidth, clipMinimum, clipMaximum); break;
     957    case 16: partialButterflyInverse16( coeff, tmp, shift_1st, iWidth, clipMinimum, clipMaximum); break;
     958    case 32: partialButterflyInverse32( coeff, tmp, shift_1st, iWidth, clipMinimum, clipMaximum); break;
     959
     960    default:
     961      assert(0); exit (1); break;
     962  }
     963
          // Second pass: the transform size is selected by the block width; results are
          // clipped to the numeric limits of the Pel type.
     964  switch (iWidth)
     965  {
     966    // Clipping here is not in the standard, but is used to protect the "Pel" data type into which the inverse-transformed samples will be copied
     967    case 4:
     968      {
     969        if ((iHeight == 4) && useDST)    // Check for DCT or DST
     970        {
     971          fastInverseDst( tmp, block, shift_2nd, std::numeric_limits<Pel>::min(), std::numeric_limits<Pel>::max() );
     972        }
     973        else
     974        {
     975          partialButterflyInverse4 ( tmp, block, shift_2nd, iHeight, std::numeric_limits<Pel>::min(), std::numeric_limits<Pel>::max());
     976        }
     977      }
     978      break;
     979
     980    case  8: partialButterflyInverse8 ( tmp, block, shift_2nd, iHeight, std::numeric_limits<Pel>::min(), std::numeric_limits<Pel>::max()); break;
     981    case 16: partialButterflyInverse16( tmp, block, shift_2nd, iHeight, std::numeric_limits<Pel>::min(), std::numeric_limits<Pel>::max()); break;
     982    case 32: partialButterflyInverse32( tmp, block, shift_2nd, iHeight, std::numeric_limits<Pel>::min(), std::numeric_limits<Pel>::max()); break;
     983
     984    default:
     985      assert(0); exit (1); break;
     986  }
     987}
     988
     989
     990// To minimize the distortion only. No rate is considered.
     991Void TComTrQuant::signBitHidingHDQ( TCoeff* pQCoef, TCoeff* pCoef, TCoeff* deltaU, const TUEntropyCodingParameters &codingParameters, const Int maxLog2TrDynamicRange )
     992{
     993  const UInt width     = codingParameters.widthInGroups  << MLS_CG_LOG2_WIDTH;
     994  const UInt height    = codingParameters.heightInGroups << MLS_CG_LOG2_HEIGHT;
     995  const UInt groupSize = 1 << MLS_CG_SIZE;
     996
     997  const TCoeff entropyCodingMinimum = -(1 << maxLog2TrDynamicRange);
     998  const TCoeff entropyCodingMaximum =  (1 << maxLog2TrDynamicRange) - 1;
     999
    8701000  Int lastCG = -1;
    8711001  Int absSum = 0 ;
    8721002  Int n ;
    8731003
    874   for( Int subSet = (width*height-1) >> LOG2_SCAN_SET_SIZE; subSet >= 0; subSet-- )
    875   {
    876     Int  subPos     = subSet << LOG2_SCAN_SET_SIZE;
    877     Int  firstNZPosInCG=SCAN_SET_SIZE , lastNZPosInCG=-1 ;
     1004  for( Int subSet = (width*height-1) >> MLS_CG_SIZE; subSet >= 0; subSet-- )
     1005  {
     1006    Int  subPos = subSet << MLS_CG_SIZE;
     1007    Int  firstNZPosInCG=groupSize , lastNZPosInCG=-1 ;
    8781008    absSum = 0 ;
    8791009
    880     for(n = SCAN_SET_SIZE-1; n >= 0; --n )
    881     {
    882       if( pQCoef[ scan[ n + subPos ]] )
     1010    for(n = groupSize-1; n >= 0; --n )
     1011    {
     1012      if( pQCoef[ codingParameters.scan[ n + subPos ]] )
    8831013      {
    8841014        lastNZPosInCG = n;
     
    8871017    }
    8881018
    889     for(n = 0; n <SCAN_SET_SIZE; n++ )
    890     {
    891       if( pQCoef[ scan[ n + subPos ]] )
     1019    for(n = 0; n <groupSize; n++ )
     1020    {
     1021      if( pQCoef[ codingParameters.scan[ n + subPos ]] )
    8921022      {
    8931023        firstNZPosInCG = n;
     
    8981028    for(n = firstNZPosInCG; n <=lastNZPosInCG; n++ )
    8991029    {
    900       absSum += pQCoef[ scan[ n + subPos ]];
    901     }
    902 
    903     if(lastNZPosInCG>=0 && lastCG==-1) 
    904     {
    905       lastCG = 1 ; 
     1030      absSum += Int(pQCoef[ codingParameters.scan[ n + subPos ]]);
     1031    }
     1032
     1033    if(lastNZPosInCG>=0 && lastCG==-1)
     1034    {
     1035      lastCG = 1 ;
    9061036    }
    9071037
    9081038    if( lastNZPosInCG-firstNZPosInCG>=SBH_THRESHOLD )
    9091039    {
    910       UInt signbit = (pQCoef[scan[subPos+firstNZPosInCG]]>0?0:1) ;
     1040      UInt signbit = (pQCoef[codingParameters.scan[subPos+firstNZPosInCG]]>0?0:1) ;
    9111041      if( signbit!=(absSum&0x1) )  //compare signbit with sum_parity
    9121042      {
    913         Int minCostInc = MAX_INT,  minPos =-1, finalChange=0, curCost=MAX_INT, curChange=0;
    914        
    915         for( n = (lastCG==1?lastNZPosInCG:SCAN_SET_SIZE-1) ; n >= 0; --n )
    916         {
    917           UInt blkPos   = scan[ n+subPos ];
     1043        TCoeff curCost    = std::numeric_limits<TCoeff>::max();
     1044        TCoeff minCostInc = std::numeric_limits<TCoeff>::max();
     1045        Int minPos =-1, finalChange=0, curChange=0;
     1046
     1047        for( n = (lastCG==1?lastNZPosInCG:groupSize-1) ; n >= 0; --n )
     1048        {
     1049          UInt blkPos   = codingParameters.scan[ n+subPos ];
    9181050          if(pQCoef[ blkPos ] != 0 )
    9191051          {
    9201052            if(deltaU[blkPos]>0)
    9211053            {
    922               curCost = - deltaU[blkPos]; 
     1054              curCost = - deltaU[blkPos];
    9231055              curChange=1 ;
    9241056            }
    925             else 
     1057            else
    9261058            {
    9271059              //curChange =-1;
    9281060              if(n==firstNZPosInCG && abs(pQCoef[blkPos])==1)
    9291061              {
    930                 curCost=MAX_INT ;
     1062                curCost = std::numeric_limits<TCoeff>::max();
    9311063              }
    9321064              else
    9331065              {
    934                 curCost = deltaU[blkPos]; 
     1066                curCost = deltaU[blkPos];
    9351067                curChange =-1;
    9361068              }
     
    9441076              if(thisSignBit != signbit )
    9451077              {
    946                 curCost = MAX_INT;
     1078                curCost = std::numeric_limits<TCoeff>::max();
    9471079              }
    9481080              else
    949               { 
     1081              {
    9501082                curCost = - (deltaU[blkPos])  ;
    9511083                curChange = 1 ;
     
    9671099        } //CG loop
    9681100
    969         if(pQCoef[minPos] == 32767 || pQCoef[minPos] == -32768)
     1101        if(pQCoef[minPos] == entropyCodingMaximum || pQCoef[minPos] == entropyCodingMinimum)
    9701102        {
    9711103          finalChange = -1;
     
    9741106        if(pCoef[minPos]>=0)
    9751107        {
    976           pQCoef[minPos] += finalChange ; 
    977         }
    978         else 
    979         { 
     1108          pQCoef[minPos] += finalChange ;
     1109        }
     1110        else
     1111        {
    9801112          pQCoef[minPos] -= finalChange ;
    981         } 
     1113        }
    9821114      } // Hide
    9831115    }
    984     if(lastCG==1) 
     1116    if(lastCG==1)
    9851117    {
    9861118      lastCG=0 ;
     
    9911123}
    9921124
    993 Void TComTrQuant::xQuant( TComDataCU* pcCU,
    994                           Int*        pSrc,
    995                           TCoeff*     pDes,
     1125
     1126Void TComTrQuant::xQuant(       TComTU       &rTu,
     1127                                TCoeff      * pSrc,
     1128                                TCoeff      * pDes,
    9961129#if ADAPTIVE_QP_SELECTION
    997                           Int*&       pArlDes,
    998 #endif
    999                           Int         iWidth,
    1000                           Int         iHeight,
    1001                           UInt&       uiAcSum,
    1002                           TextType    eTType,
    1003                           UInt        uiAbsPartIdx )
    1004 {
    1005   Int*   piCoef    = pSrc;
     1130                                TCoeff      *pArlDes,
     1131#endif
     1132                                TCoeff       &uiAbsSum,
     1133                          const ComponentID   compID,
     1134                          const QpParam      &cQP )
     1135{
     1136  const TComRectangle &rect = rTu.getRect(compID);
     1137  const UInt uiWidth        = rect.width;
     1138  const UInt uiHeight       = rect.height;
     1139  TComDataCU* pcCU          = rTu.getCU();
     1140  const UInt uiAbsPartIdx   = rTu.GetAbsPartIdxTU();
     1141  const Int channelBitDepth = pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
     1142
     1143  TCoeff* piCoef    = pSrc;
    10061144  TCoeff* piQCoef   = pDes;
    10071145#if ADAPTIVE_QP_SELECTION
    1008   Int*   piArlCCoef = pArlDes;
    1009 #endif
    1010   Int   iAdd = 0;
    1011  
    1012   Bool useRDOQ = pcCU->getTransformSkip(uiAbsPartIdx,eTType) ? m_useRDOQTS:m_useRDOQ;
    1013   if ( useRDOQ && (eTType == TEXT_LUMA || RDOQ_CHROMA))
    1014   {
     1146  TCoeff* piArlCCoef = pArlDes;
     1147#endif
     1148
     1149  const Bool useTransformSkip      = pcCU->getTransformSkip(uiAbsPartIdx, compID);
     1150  const Int  maxLog2TrDynamicRange = pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID));
     1151
     1152  Bool useRDOQ = useTransformSkip ? m_useRDOQTS : m_useRDOQ;
     1153  if ( useRDOQ && (isLuma(compID) || RDOQ_CHROMA) )
     1154  {
     1155#if T0196_SELECTIVE_RDOQ
     1156    if ( !m_useSelectiveRDOQ || xNeedRDOQ( rTu, piCoef, compID, cQP ) )
     1157    {
     1158#endif
    10151159#if ADAPTIVE_QP_SELECTION
    1016     xRateDistOptQuant( pcCU, piCoef, pDes, pArlDes, iWidth, iHeight, uiAcSum, eTType, uiAbsPartIdx );
     1160      xRateDistOptQuant( rTu, piCoef, pDes, pArlDes, uiAbsSum, compID, cQP );
    10171161#else
    1018     xRateDistOptQuant( pcCU, piCoef, pDes, iWidth, iHeight, uiAcSum, eTType, uiAbsPartIdx );
     1162      xRateDistOptQuant( rTu, piCoef, pDes, uiAbsSum, compID, cQP );
     1163#endif
     1164#if T0196_SELECTIVE_RDOQ
     1165    }
     1166    else
     1167    {
     1168      memset( pDes, 0, sizeof( TCoeff ) * uiWidth *uiHeight );
     1169      uiAbsSum = 0;
     1170    }
    10191171#endif
    10201172  }
    10211173  else
    10221174  {
    1023     const UInt   log2BlockSize   = g_aucConvertToBit[ iWidth ] + 2;
    1024 
    1025     UInt scanIdx = pcCU->getCoefScanIdx(uiAbsPartIdx, iWidth, eTType==TEXT_LUMA, pcCU->isIntra(uiAbsPartIdx));
    1026     const UInt *scan = g_auiSigLastScan[ scanIdx ][ log2BlockSize - 1 ];
    1027    
    1028     Int deltaU[32*32] ;
     1175    TUEntropyCodingParameters codingParameters;
     1176    getTUEntropyCodingParameters(codingParameters, rTu, compID);
     1177
     1178    const TCoeff entropyCodingMinimum = -(1 << maxLog2TrDynamicRange);
     1179    const TCoeff entropyCodingMaximum =  (1 << maxLog2TrDynamicRange) - 1;
     1180
     1181    TCoeff deltaU[MAX_TU_SIZE * MAX_TU_SIZE];
     1182
     1183    const UInt uiLog2TrSize = rTu.GetEquivalentLog2TrSize(compID);
     1184
     1185    Int scalingListType = getScalingListType(pcCU->getPredictionMode(uiAbsPartIdx), compID);
     1186    assert(scalingListType < SCALING_LIST_NUM);
     1187    Int *piQuantCoeff = getQuantCoeff(scalingListType, cQP.rem, uiLog2TrSize-2);
     1188
     1189    const Bool enableScalingLists             = getUseScalingList(uiWidth, uiHeight, (pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0));
     1190    const Int  defaultQuantisationCoefficient = g_quantScales[cQP.rem];
     1191
     1192    /* for 422 chroma blocks, the effective scaling applied during transformation is not a power of 2, hence it cannot be
     1193     * implemented as a bit-shift (the quantised result will be sqrt(2) * larger than required). Alternatively, adjust the
     1194     * uiLog2TrSize applied in iTransformShift, such that the result is 1/sqrt(2) the required result (i.e. smaller)
     1195     * Then a QP+3 (sqrt(2)) or QP-3 (1/sqrt(2)) method could be used to get the required result
     1196     */
     1197
     1198    // Represents scaling through forward transform
     1199    Int iTransformShift = getTransformShift(channelBitDepth, uiLog2TrSize, maxLog2TrDynamicRange);
     1200    if (useTransformSkip && pcCU->getSlice()->getSPS()->getSpsRangeExtension().getExtendedPrecisionProcessingFlag())
     1201    {
     1202      iTransformShift = std::max<Int>(0, iTransformShift);
     1203    }
     1204
     1205    const Int iQBits = QUANT_SHIFT + cQP.per + iTransformShift;
     1206    // QBits will be OK for any internal bit depth as the reduction in transform shift is balanced by an increase in Qp_per due to QpBDOffset
    10291207
    10301208#if ADAPTIVE_QP_SELECTION
    1031     QpParam cQpBase;
    1032     Int iQpBase = pcCU->getSlice()->getSliceQpBase();
    1033 
    1034     Int qpScaled;
    1035     Int qpBDOffset = (eTType == TEXT_LUMA)? pcCU->getSlice()->getSPS()->getQpBDOffsetY() : pcCU->getSlice()->getSPS()->getQpBDOffsetC();
    1036 
    1037     if(eTType == TEXT_LUMA)
    1038     {
    1039       qpScaled = iQpBase + qpBDOffset;
    1040     }
    1041     else
    1042     {
    1043       Int chromaQPOffset;
    1044       if(eTType == TEXT_CHROMA_U)
    1045       {
    1046         chromaQPOffset = pcCU->getSlice()->getPPS()->getChromaCbQpOffset() + pcCU->getSlice()->getSliceQpDeltaCb();
    1047       }
    1048       else
    1049       {
    1050         chromaQPOffset = pcCU->getSlice()->getPPS()->getChromaCrQpOffset() + pcCU->getSlice()->getSliceQpDeltaCr();
    1051       }
    1052       iQpBase = iQpBase + chromaQPOffset;
    1053      
    1054       qpScaled = Clip3( -qpBDOffset, 57, iQpBase);
    1055 
    1056       if(qpScaled < 0)
    1057       {
    1058         qpScaled = qpScaled +  qpBDOffset;
    1059       }
    1060       else
    1061       {
    1062         qpScaled = g_aucChromaScale[ qpScaled ] + qpBDOffset;
    1063       }
    1064     }
    1065     cQpBase.setQpParam(qpScaled);
    1066 #endif
    1067 
    1068     UInt uiLog2TrSize = g_aucConvertToBit[ iWidth ] + 2;
    1069     Int scalingListType = (pcCU->isIntra(uiAbsPartIdx) ? 0 : 3) + g_eTTable[(Int)eTType];
    1070     assert(scalingListType < SCALING_LIST_NUM);
    1071     Int *piQuantCoeff = 0;
    1072     piQuantCoeff = getQuantCoeff(scalingListType,m_cQP.m_iRem,uiLog2TrSize-2);
    1073 
    1074     UInt uiBitDepth = eTType == TEXT_LUMA ? g_bitDepthY : g_bitDepthC;
    1075     Int iTransformShift = MAX_TR_DYNAMIC_RANGE - uiBitDepth - uiLog2TrSize;  // Represents scaling through forward transform
     1209    Int iQBitsC = MAX_INT;
     1210    Int iAddC   = MAX_INT;
     1211
     1212    if (m_bUseAdaptQpSelect)
     1213    {
     1214      iQBitsC = iQBits - ARL_C_PRECISION;
     1215      iAddC   = 1 << (iQBitsC-1);
     1216    }
     1217#endif
     1218
     1219    const Int iAdd   = (pcCU->getSlice()->getSliceType()==I_SLICE ? 171 : 85) << (iQBits-9);
     1220    const Int qBits8 = iQBits - 8;
     1221
     1222    for( Int uiBlockPos = 0; uiBlockPos < uiWidth*uiHeight; uiBlockPos++ )
     1223    {
     1224      const TCoeff iLevel   = piCoef[uiBlockPos];
     1225      const TCoeff iSign    = (iLevel < 0 ? -1: 1);
     1226
     1227      const Int64  tmpLevel = (Int64)abs(iLevel) * (enableScalingLists ? piQuantCoeff[uiBlockPos] : defaultQuantisationCoefficient);
    10761228
    10771229#if ADAPTIVE_QP_SELECTION
    1078     Int iQBits = QUANT_SHIFT + cQpBase.m_iPer + iTransformShift;
    1079     iAdd = (pcCU->getSlice()->getSliceType()==I_SLICE ? 171 : 85) << (iQBits-9);
    1080     Int iQBitsC = QUANT_SHIFT + cQpBase.m_iPer + iTransformShift - ARL_C_PRECISION; 
    1081     Int iAddC   = 1 << (iQBitsC-1);
    1082 #else
    1083     Int iQBits = QUANT_SHIFT + m_cQP.m_iPer + iTransformShift;                // Right shift of non-RDOQ quantizer;  level = (coeff*uiQ + offset)>>q_bits
    1084     iAdd = (pcCU->getSlice()->getSliceType()==I_SLICE ? 171 : 85) << (iQBits-9);
    1085 #endif
    1086 
    1087     Int qBits8 = iQBits-8;
    1088     for( Int n = 0; n < iWidth*iHeight; n++ )
    1089     {
    1090       Int iLevel;
    1091       Int  iSign;
    1092       UInt uiBlockPos = n;
    1093       iLevel  = piCoef[uiBlockPos];
    1094       iSign   = (iLevel < 0 ? -1: 1);     
    1095 
    1096 #if ADAPTIVE_QP_SELECTION
    1097       Int64 tmpLevel = (Int64)abs(iLevel) * piQuantCoeff[uiBlockPos];
    10981230      if( m_bUseAdaptQpSelect )
    10991231      {
    1100         piArlCCoef[uiBlockPos] = (Int)((tmpLevel + iAddC ) >> iQBitsC);
    1101       }
    1102       iLevel = (Int)((tmpLevel + iAdd ) >> iQBits);
    1103       deltaU[uiBlockPos] = (Int)((tmpLevel - (iLevel<<iQBits) )>> qBits8);
    1104 #else
    1105       iLevel = ((Int64)abs(iLevel) * piQuantCoeff[uiBlockPos] + iAdd ) >> iQBits;
    1106       deltaU[uiBlockPos] = (Int)( ((Int64)abs(piCoef[uiBlockPos]) * piQuantCoeff[uiBlockPos] - (iLevel<<iQBits) )>> qBits8 );
    1107 #endif
    1108       uiAcSum += iLevel;
    1109       iLevel *= iSign;       
    1110       piQCoef[uiBlockPos] = Clip3( -32768, 32767, iLevel );
     1232        piArlCCoef[uiBlockPos] = (TCoeff)((tmpLevel + iAddC ) >> iQBitsC);
     1233      }
     1234#endif
     1235
     1236      const TCoeff quantisedMagnitude = TCoeff((tmpLevel + iAdd ) >> iQBits);
     1237      deltaU[uiBlockPos] = (TCoeff)((tmpLevel - (quantisedMagnitude<<iQBits) )>> qBits8);
     1238
     1239      uiAbsSum += quantisedMagnitude;
     1240      const TCoeff quantisedCoefficient = quantisedMagnitude * iSign;
     1241
     1242      piQCoef[uiBlockPos] = Clip3<TCoeff>( entropyCodingMinimum, entropyCodingMaximum, quantisedCoefficient );
    11111243    } // for n
     1244
    11121245    if( pcCU->getSlice()->getPPS()->getSignHideFlag() )
    11131246    {
    1114       if(uiAcSum>=2)
    1115       {
    1116         signBitHidingHDQ( piQCoef, piCoef, scan, deltaU, iWidth, iHeight ) ;
     1247      if(uiAbsSum >= 2) //this prevents TUs with only one coefficient of value 1 from being tested
     1248      {
     1249        signBitHidingHDQ( piQCoef, piCoef, deltaU, codingParameters, maxLog2TrDynamicRange ) ;
    11171250      }
    11181251    }
    11191252  } //if RDOQ
    11201253  //return;
    1121 
    1122 }
    1123 
    1124 Void TComTrQuant::xDeQuant(Int bitDepth, const TCoeff* pSrc, Int* pDes, Int iWidth, Int iHeight, Int scalingListType )
    1125 {
    1126  
    1127   const TCoeff* piQCoef   = pSrc;
    1128   Int*   piCoef    = pDes;
    1129  
    1130   if ( iWidth > (Int)m_uiMaxTrSize )
    1131   {
    1132     iWidth  = m_uiMaxTrSize;
    1133     iHeight = m_uiMaxTrSize;
    1134   }
    1135  
    1136   Int iShift,iAdd,iCoeffQ;
    1137   UInt uiLog2TrSize = g_aucConvertToBit[ iWidth ] + 2;
    1138 
    1139   Int iTransformShift = MAX_TR_DYNAMIC_RANGE - bitDepth - uiLog2TrSize;
    1140 
    1141   iShift = QUANT_IQUANT_SHIFT - QUANT_SHIFT - iTransformShift;
    1142 
    1143   TCoeff clipQCoef;
    1144 
    1145   if(getUseScalingList())
    1146   {
    1147     iShift += 4;
    1148     Int *piDequantCoef = getDequantCoeff(scalingListType,m_cQP.m_iRem,uiLog2TrSize-2);
    1149 
    1150     if(iShift > m_cQP.m_iPer)
    1151     {
    1152       iAdd = 1 << (iShift - m_cQP.m_iPer - 1);
    1153      
    1154       for( Int n = 0; n < iWidth*iHeight; n++ )
    1155       {
    1156         clipQCoef = Clip3( -32768, 32767, piQCoef[n] );
    1157         iCoeffQ = ((clipQCoef * piDequantCoef[n]) + iAdd ) >> (iShift -  m_cQP.m_iPer);
    1158         piCoef[n] = Clip3(-32768,32767,iCoeffQ);
     1254}
     1255
     1256#if T0196_SELECTIVE_RDOQ
     1257Bool TComTrQuant::xNeedRDOQ( TComTU &rTu, TCoeff * pSrc, const ComponentID compID, const QpParam &cQP )
     1258{
     1259  const TComRectangle &rect = rTu.getRect(compID);
     1260  const UInt uiWidth        = rect.width;
     1261  const UInt uiHeight       = rect.height;
     1262  TComDataCU* pcCU          = rTu.getCU();
     1263  const UInt uiAbsPartIdx   = rTu.GetAbsPartIdxTU();
     1264  const Int channelBitDepth = pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
     1265
     1266  TCoeff* piCoef    = pSrc;
     1267
     1268  const Bool useTransformSkip      = pcCU->getTransformSkip(uiAbsPartIdx, compID);
     1269  const Int  maxLog2TrDynamicRange = pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID));
     1270
     1271  const UInt uiLog2TrSize = rTu.GetEquivalentLog2TrSize(compID);
     1272
     1273  Int scalingListType = getScalingListType(pcCU->getPredictionMode(uiAbsPartIdx), compID);
     1274  assert(scalingListType < SCALING_LIST_NUM);
     1275  Int *piQuantCoeff = getQuantCoeff(scalingListType, cQP.rem, uiLog2TrSize-2);
     1276
     1277  const Bool enableScalingLists             = getUseScalingList(uiWidth, uiHeight, (pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0));
     1278  const Int  defaultQuantisationCoefficient = g_quantScales[cQP.rem];
     1279
     1280  /* for 422 chroma blocks, the effective scaling applied during transformation is not a power of 2, hence it cannot be
     1281    * implemented as a bit-shift (the quantised result will be sqrt(2) * larger than required). Alternatively, adjust the
     1282    * uiLog2TrSize applied in iTransformShift, such that the result is 1/sqrt(2) the required result (i.e. smaller)
     1283    * Then a QP+3 (sqrt(2)) or QP-3 (1/sqrt(2)) method could be used to get the required result
     1284    */
     1285
     1286  // Represents scaling through forward transform
     1287  Int iTransformShift = getTransformShift(channelBitDepth, uiLog2TrSize, maxLog2TrDynamicRange);
     1288  if (useTransformSkip && pcCU->getSlice()->getSPS()->getSpsRangeExtension().getExtendedPrecisionProcessingFlag())
     1289  {
     1290    iTransformShift = std::max<Int>(0, iTransformShift);
     1291  }
     1292
     1293  const Int iQBits = QUANT_SHIFT + cQP.per + iTransformShift;
     1294  // QBits will be OK for any internal bit depth as the reduction in transform shift is balanced by an increase in Qp_per due to QpBDOffset
     1295
     1296  // iAdd is different from the iAdd used in normal quantization
     1297  const Int iAdd   = (compID == COMPONENT_Y ? 171 : 256) << (iQBits-9);
     1298
     1299  for( Int uiBlockPos = 0; uiBlockPos < uiWidth*uiHeight; uiBlockPos++ )
     1300  {
     1301    const TCoeff iLevel   = piCoef[uiBlockPos];
     1302    const Int64  tmpLevel = (Int64)abs(iLevel) * (enableScalingLists ? piQuantCoeff[uiBlockPos] : defaultQuantisationCoefficient);
     1303    const TCoeff quantisedMagnitude = TCoeff((tmpLevel + iAdd ) >> iQBits);
     1304
     1305    if ( quantisedMagnitude != 0 )
     1306    {
     1307      return true;
     1308    }
     1309  } // for n
     1310  return false;
     1311}
     1312#endif
     1313
     1314Void TComTrQuant::xDeQuant(       TComTU        &rTu,
     1315                            const TCoeff       * pSrc,
     1316                                  TCoeff       * pDes,
     1317                            const ComponentID    compID,
     1318                            const QpParam       &cQP )
     1319{
     1320  assert(compID<MAX_NUM_COMPONENT);
     1321
     1322        TComDataCU          *pcCU               = rTu.getCU();
     1323  const UInt                 uiAbsPartIdx       = rTu.GetAbsPartIdxTU();
     1324  const TComRectangle       &rect               = rTu.getRect(compID);
     1325  const UInt                 uiWidth            = rect.width;
     1326  const UInt                 uiHeight           = rect.height;
     1327  const TCoeff        *const piQCoef            = pSrc;
     1328        TCoeff        *const piCoef             = pDes;
     1329  const UInt                 uiLog2TrSize       = rTu.GetEquivalentLog2TrSize(compID);
     1330  const UInt                 numSamplesInBlock  = uiWidth*uiHeight;
     1331  const Int                  maxLog2TrDynamicRange  = pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID));
     1332  const TCoeff               transformMinimum   = -(1 << maxLog2TrDynamicRange);
     1333  const TCoeff               transformMaximum   =  (1 << maxLog2TrDynamicRange) - 1;
     1334  const Bool                 enableScalingLists = getUseScalingList(uiWidth, uiHeight, (pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0));
     1335  const Int                  scalingListType    = getScalingListType(pcCU->getPredictionMode(uiAbsPartIdx), compID);
     1336#if O0043_BEST_EFFORT_DECODING
     1337  const Int                  channelBitDepth    = pcCU->getSlice()->getSPS()->getStreamBitDepth(toChannelType(compID));
     1338#else
     1339  const Int                  channelBitDepth    = pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
     1340#endif
     1341
     1342  assert (scalingListType < SCALING_LIST_NUM);
     1343  assert ( uiWidth <= m_uiMaxTrSize );
     1344
     1345  // Represents scaling through forward transform
     1346  const Bool bClipTransformShiftTo0 = (pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0) && pcCU->getSlice()->getSPS()->getSpsRangeExtension().getExtendedPrecisionProcessingFlag();
     1347  const Int  originalTransformShift = getTransformShift(channelBitDepth, uiLog2TrSize, maxLog2TrDynamicRange);
     1348  const Int  iTransformShift        = bClipTransformShiftTo0 ? std::max<Int>(0, originalTransformShift) : originalTransformShift;
     1349
     1350  const Int QP_per = cQP.per;
     1351  const Int QP_rem = cQP.rem;
     1352
     1353  const Int rightShift = (IQUANT_SHIFT - (iTransformShift + QP_per)) + (enableScalingLists ? LOG2_SCALING_LIST_NEUTRAL_VALUE : 0);
     1354
     1355  if(enableScalingLists)
     1356  {
     1357    //from the dequantisation equation:
     1358    //iCoeffQ                         = ((Intermediate_Int(clipQCoef) * piDequantCoef[deQuantIdx]) + iAdd ) >> rightShift
     1359    //(sizeof(Intermediate_Int) * 8)  =              inputBitDepth    +    dequantCoefBits                   - rightShift
     1360    const UInt             dequantCoefBits     = 1 + IQUANT_SHIFT + SCALING_LIST_BITS;
     1361    const UInt             targetInputBitDepth = std::min<UInt>((maxLog2TrDynamicRange + 1), (((sizeof(Intermediate_Int) * 8) + rightShift) - dequantCoefBits));
     1362
     1363    const Intermediate_Int inputMinimum        = -(1 << (targetInputBitDepth - 1));
     1364    const Intermediate_Int inputMaximum        =  (1 << (targetInputBitDepth - 1)) - 1;
     1365
     1366    Int *piDequantCoef = getDequantCoeff(scalingListType,QP_rem,uiLog2TrSize-2);
     1367
     1368    if(rightShift > 0)
     1369    {
     1370      const Intermediate_Int iAdd = 1 << (rightShift - 1);
     1371
     1372      for( Int n = 0; n < numSamplesInBlock; n++ )
     1373      {
     1374        const TCoeff           clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, piQCoef[n]));
     1375        const Intermediate_Int iCoeffQ   = ((Intermediate_Int(clipQCoef) * piDequantCoef[n]) + iAdd ) >> rightShift;
     1376
     1377        piCoef[n] = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
    11591378      }
    11601379    }
    11611380    else
    11621381    {
    1163       for( Int n = 0; n < iWidth*iHeight; n++ )
    1164       {
    1165         clipQCoef = Clip3( -32768, 32767, piQCoef[n] );
    1166         iCoeffQ   = Clip3( -32768, 32767, clipQCoef * piDequantCoef[n] ); // Clip to avoid possible overflow in following shift left operation
    1167         piCoef[n] = Clip3( -32768, 32767, iCoeffQ << ( m_cQP.m_iPer - iShift ) );
     1382      const Int leftShift = -rightShift;
     1383
     1384      for( Int n = 0; n < numSamplesInBlock; n++ )
     1385      {
     1386        const TCoeff           clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, piQCoef[n]));
     1387        const Intermediate_Int iCoeffQ   = (Intermediate_Int(clipQCoef) * piDequantCoef[n]) << leftShift;
     1388
     1389        piCoef[n] = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
    11681390      }
    11691391    }
     
    11711393  else
    11721394  {
    1173     iAdd = 1 << (iShift-1);
    1174     Int scale = g_invQuantScales[m_cQP.m_iRem] << m_cQP.m_iPer;
    1175 
    1176     for( Int n = 0; n < iWidth*iHeight; n++ )
    1177     {
    1178       clipQCoef = Clip3( -32768, 32767, piQCoef[n] );
    1179       iCoeffQ = ( clipQCoef * scale + iAdd ) >> iShift;
    1180       piCoef[n] = Clip3(-32768,32767,iCoeffQ);
    1181     }
    1182   }
    1183 }
    1184 
    1185 Void TComTrQuant::init( UInt uiMaxTrSize,
    1186                        Bool bUseRDOQ, 
    1187                        Bool bUseRDOQTS,
    1188                        Bool bEnc, Bool useTransformSkipFast
     1395    const Int scale     =  g_invQuantScales[QP_rem];
     1396    const Int scaleBits =     (IQUANT_SHIFT + 1)   ;
     1397
     1398    //from the dequantisation equation:
     1399    //iCoeffQ                         = Intermediate_Int((Int64(clipQCoef) * scale + iAdd) >> rightShift);
     1400    //(sizeof(Intermediate_Int) * 8)  =                    inputBitDepth   + scaleBits      - rightShift
     1401    const UInt             targetInputBitDepth = std::min<UInt>((maxLog2TrDynamicRange + 1), (((sizeof(Intermediate_Int) * 8) + rightShift) - scaleBits));
     1402    const Intermediate_Int inputMinimum        = -(1 << (targetInputBitDepth - 1));
     1403    const Intermediate_Int inputMaximum        =  (1 << (targetInputBitDepth - 1)) - 1;
     1404
     1405    if (rightShift > 0)
     1406    {
     1407      const Intermediate_Int iAdd = 1 << (rightShift - 1);
     1408
     1409      for( Int n = 0; n < numSamplesInBlock; n++ )
     1410      {
     1411        const TCoeff           clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, piQCoef[n]));
     1412        const Intermediate_Int iCoeffQ   = (Intermediate_Int(clipQCoef) * scale + iAdd) >> rightShift;
     1413
     1414        piCoef[n] = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
     1415      }
     1416    }
     1417    else
     1418    {
     1419      const Int leftShift = -rightShift;
     1420
     1421      for( Int n = 0; n < numSamplesInBlock; n++ )
     1422      {
     1423        const TCoeff           clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, piQCoef[n]));
     1424        const Intermediate_Int iCoeffQ   = (Intermediate_Int(clipQCoef) * scale) << leftShift;
     1425
     1426        piCoef[n] = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
     1427      }
     1428    }
     1429  }
     1430}
     1431
     1432
     1433Void TComTrQuant::init(   UInt  uiMaxTrSize,
     1434                          Bool  bUseRDOQ,
     1435                          Bool  bUseRDOQTS,
     1436#if T0196_SELECTIVE_RDOQ
     1437                          Bool  useSelectiveRDOQ,
     1438#endif
     1439                          Bool  bEnc,
     1440                          Bool  useTransformSkipFast
    11891441#if ADAPTIVE_QP_SELECTION
    1190                        , Bool bUseAdaptQpSelect
     1442                        , Bool bUseAdaptQpSelect
    11911443#endif
    11921444                       )
     
    11941446  m_uiMaxTrSize  = uiMaxTrSize;
    11951447  m_bEnc         = bEnc;
    1196   m_useRDOQ     = bUseRDOQ;
    1197   m_useRDOQTS     = bUseRDOQTS;
     1448  m_useRDOQ      = bUseRDOQ;
     1449  m_useRDOQTS    = bUseRDOQTS;
     1450#if T0196_SELECTIVE_RDOQ
     1451  m_useSelectiveRDOQ = useSelectiveRDOQ;
     1452#endif
    11981453#if ADAPTIVE_QP_SELECTION
    11991454  m_bUseAdaptQpSelect = bUseAdaptQpSelect;
     
    12021457}
    12031458
    1204 Void TComTrQuant::transformNxN( TComDataCU* pcCU,
    1205                                Pel*        pcResidual,
    1206                                UInt        uiStride,
    1207                                TCoeff*     rpcCoeff,
     1459
     1460Void TComTrQuant::transformNxN(       TComTU        & rTu,
     1461                                const ComponentID     compID,
     1462                                      Pel          *  pcResidual,
     1463                                const UInt            uiStride,
     1464                                      TCoeff       *  rpcCoeff,
    12081465#if ADAPTIVE_QP_SELECTION
    1209                                Int*&       rpcArlCoeff,
    1210 #endif
    1211                                UInt        uiWidth,
    1212                                UInt        uiHeight,
    1213                                UInt&       uiAbsSum,
    1214                                TextType    eTType,
    1215                                UInt        uiAbsPartIdx,
    1216                                Bool        useTransformSkip
    1217                                )
    1218 {
    1219   if (pcCU->getCUTransquantBypass(uiAbsPartIdx))
    1220   {
    1221     uiAbsSum=0;
    1222     for (UInt k = 0; k<uiHeight; k++)
    1223     {
    1224       for (UInt j = 0; j<uiWidth; j++)
    1225       {
    1226         rpcCoeff[k*uiWidth+j]= pcResidual[k*uiStride+j];
    1227         uiAbsSum += abs(pcResidual[k*uiStride+j]);
    1228       }
    1229     }
     1466                                      TCoeff       *  pcArlCoeff,
     1467#endif
     1468                                      TCoeff        & uiAbsSum,
     1469                                const QpParam       & cQP
     1470                              )
     1471{
     1472  const TComRectangle &rect = rTu.getRect(compID);
     1473  const UInt uiWidth        = rect.width;
     1474  const UInt uiHeight       = rect.height;
     1475  TComDataCU* pcCU          = rTu.getCU();
     1476  const UInt uiAbsPartIdx   = rTu.GetAbsPartIdxTU();
     1477  const UInt uiOrgTrDepth   = rTu.GetTransformDepthRel();
     1478
     1479  uiAbsSum=0;
     1480
     1481  RDPCMMode rdpcmMode = RDPCM_OFF;
     1482  rdpcmNxN( rTu, compID, pcResidual, uiStride, cQP, rpcCoeff, uiAbsSum, rdpcmMode );
     1483
     1484  if (rdpcmMode == RDPCM_OFF)
     1485  {
     1486    uiAbsSum = 0;
     1487    //transform and quantise
     1488    if(pcCU->getCUTransquantBypass(uiAbsPartIdx))
     1489    {
     1490      const Bool rotateResidual = rTu.isNonTransformedResidualRotated(compID);
     1491      const UInt uiSizeMinus1   = (uiWidth * uiHeight) - 1;
     1492
     1493      for (UInt y = 0, coefficientIndex = 0; y<uiHeight; y++)
     1494      {
     1495        for (UInt x = 0; x<uiWidth; x++, coefficientIndex++)
     1496        {
     1497          const Pel currentSample = pcResidual[(y * uiStride) + x];
     1498
     1499          rpcCoeff[rotateResidual ? (uiSizeMinus1 - coefficientIndex) : coefficientIndex] = currentSample;
     1500          uiAbsSum += TCoeff(abs(currentSample));
     1501        }
     1502      }
     1503    }
     1504    else
     1505    {
     1506#if DEBUG_TRANSFORM_AND_QUANTISE
     1507      std::cout << g_debugCounter << ": " << uiWidth << "x" << uiHeight << " channel " << compID << " TU at input to transform\n";
     1508      printBlock(pcResidual, uiWidth, uiHeight, uiStride);
     1509#endif
     1510
     1511      assert( (pcCU->getSlice()->getSPS()->getMaxTrSize() >= uiWidth) );
     1512
     1513      if(pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0)
     1514      {
     1515        xTransformSkip( pcResidual, uiStride, m_plTempCoeff, rTu, compID );
     1516      }
     1517      else
     1518      {
     1519        const Int channelBitDepth=pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
     1520        xT( channelBitDepth, rTu.useDST(compID), pcResidual, uiStride, m_plTempCoeff, uiWidth, uiHeight, pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID)) );
     1521      }
     1522
     1523#if DEBUG_TRANSFORM_AND_QUANTISE
     1524      std::cout << g_debugCounter << ": " << uiWidth << "x" << uiHeight << " channel " << compID << " TU between transform and quantiser\n";
     1525      printBlock(m_plTempCoeff, uiWidth, uiHeight, uiWidth);
     1526#endif
     1527
     1528      xQuant( rTu, m_plTempCoeff, rpcCoeff,
     1529
     1530#if ADAPTIVE_QP_SELECTION
     1531              pcArlCoeff,
     1532#endif
     1533              uiAbsSum, compID, cQP );
     1534
     1535#if DEBUG_TRANSFORM_AND_QUANTISE
     1536      std::cout << g_debugCounter << ": " << uiWidth << "x" << uiHeight << " channel " << compID << " TU at output of quantiser\n";
     1537      printBlock(rpcCoeff, uiWidth, uiHeight, uiWidth);
     1538#endif
     1539    }
     1540  }
     1541
     1542    //set the CBF
     1543  pcCU->setCbfPartRange((((uiAbsSum > 0) ? 1 : 0) << uiOrgTrDepth), compID, uiAbsPartIdx, rTu.GetAbsPartIdxNumParts(compID));
     1544}
     1545
     1546
     1547Void TComTrQuant::invTransformNxN(      TComTU        &rTu,
     1548                                  const ComponentID    compID,
     1549                                        Pel          *pcResidual,
     1550                                  const UInt           uiStride,
     1551                                        TCoeff       * pcCoeff,
     1552                                  const QpParam       &cQP
     1553                                        DEBUG_STRING_FN_DECLAREP(psDebug))
     1554{
     1555  TComDataCU* pcCU=rTu.getCU();
     1556  const UInt uiAbsPartIdx = rTu.GetAbsPartIdxTU();
     1557  const TComRectangle &rect = rTu.getRect(compID);
     1558  const UInt uiWidth = rect.width;
     1559  const UInt uiHeight = rect.height;
     1560
     1561  if (uiWidth != uiHeight) //for intra, the TU will have been split above this level, so this condition won't be true, hence this only affects inter
     1562  {
     1563    //------------------------------------------------
     1564
     1565    //recurse deeper
     1566
     1567    TComTURecurse subTURecurse(rTu, false, TComTU::VERTICAL_SPLIT, true, compID);
     1568
     1569    do
     1570    {
     1571      //------------------
     1572
     1573      const UInt lineOffset = subTURecurse.GetSectionNumber() * subTURecurse.getRect(compID).height;
     1574
     1575      Pel    *subTUResidual     = pcResidual + (lineOffset * uiStride);
     1576      TCoeff *subTUCoefficients = pcCoeff     + (lineOffset * subTURecurse.getRect(compID).width);
     1577
     1578      invTransformNxN(subTURecurse, compID, subTUResidual, uiStride, subTUCoefficients, cQP DEBUG_STRING_PASS_INTO(psDebug));
     1579
     1580      //------------------
     1581
     1582    } while (subTURecurse.nextSection(rTu));
     1583
     1584    //------------------------------------------------
     1585
    12301586    return;
    12311587  }
    1232   UInt uiMode;  //luma intra pred
    1233   if(eTType == TEXT_LUMA && pcCU->getPredictionMode(uiAbsPartIdx) == MODE_INTRA )
    1234   {
    1235     uiMode = pcCU->getLumaIntraDir( uiAbsPartIdx );
     1588
     1589#if DEBUG_STRING
     1590  if (psDebug)
     1591  {
     1592    std::stringstream ss(stringstream::out);
     1593    printBlockToStream(ss, (compID==0)?"###InvTran ip Ch0: " : ((compID==1)?"###InvTran ip Ch1: ":"###InvTran ip Ch2: "), pcCoeff, uiWidth, uiHeight, uiWidth);
     1594    DEBUG_STRING_APPEND((*psDebug), ss.str())
     1595  }
     1596#endif
     1597
     1598  if(pcCU->getCUTransquantBypass(uiAbsPartIdx))
     1599  {
     1600    const Bool rotateResidual = rTu.isNonTransformedResidualRotated(compID);
     1601    const UInt uiSizeMinus1   = (uiWidth * uiHeight) - 1;
     1602
     1603    for (UInt y = 0, coefficientIndex = 0; y<uiHeight; y++)
     1604    {
     1605      for (UInt x = 0; x<uiWidth; x++, coefficientIndex++)
     1606      {
     1607        pcResidual[(y * uiStride) + x] = Pel(pcCoeff[rotateResidual ? (uiSizeMinus1 - coefficientIndex) : coefficientIndex]);
     1608      }
     1609    }
    12361610  }
    12371611  else
    12381612  {
    1239     uiMode = REG_DCT;
    1240   }
    1241  
     1613#if DEBUG_TRANSFORM_AND_QUANTISE
     1614    std::cout << g_debugCounter << ": " << uiWidth << "x" << uiHeight << " channel " << compID << " TU at input to dequantiser\n";
     1615    printBlock(pcCoeff, uiWidth, uiHeight, uiWidth);
     1616#endif
     1617
     1618    xDeQuant(rTu, pcCoeff, m_plTempCoeff, compID, cQP);
     1619
     1620#if DEBUG_TRANSFORM_AND_QUANTISE
     1621    std::cout << g_debugCounter << ": " << uiWidth << "x" << uiHeight << " channel " << compID << " TU between dequantiser and inverse-transform\n";
     1622    printBlock(m_plTempCoeff, uiWidth, uiHeight, uiWidth);
     1623#endif
     1624
     1625#if DEBUG_STRING
     1626    if (psDebug)
     1627    {
     1628      std::stringstream ss(stringstream::out);
     1629      printBlockToStream(ss, "###InvTran deq: ", m_plTempCoeff, uiWidth, uiHeight, uiWidth);
     1630      (*psDebug)+=ss.str();
     1631    }
     1632#endif
     1633
     1634    if(pcCU->getTransformSkip(uiAbsPartIdx, compID))
     1635    {
     1636      xITransformSkip( m_plTempCoeff, pcResidual, uiStride, rTu, compID );
     1637
     1638#if DEBUG_STRING
     1639      if (psDebug)
     1640      {
     1641        std::stringstream ss(stringstream::out);
     1642        printBlockToStream(ss, "###InvTran resi: ", pcResidual, uiWidth, uiHeight, uiStride);
     1643        (*psDebug)+=ss.str();
     1644        (*psDebug)+="(<- was a Transform-skipped block)\n";
     1645      }
     1646#endif
     1647    }
     1648    else
     1649    {
     1650#if O0043_BEST_EFFORT_DECODING
     1651      const Int channelBitDepth = pcCU->getSlice()->getSPS()->getStreamBitDepth(toChannelType(compID));
     1652#else
     1653      const Int channelBitDepth = pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
     1654#endif
     1655      xIT( channelBitDepth, rTu.useDST(compID), m_plTempCoeff, pcResidual, uiStride, uiWidth, uiHeight, pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID)) );
     1656
     1657#if DEBUG_STRING
     1658      if (psDebug)
     1659      {
     1660        std::stringstream ss(stringstream::out);
     1661        printBlockToStream(ss, "###InvTran resi: ", pcResidual, uiWidth, uiHeight, uiStride);
     1662        (*psDebug)+=ss.str();
     1663        (*psDebug)+="(<- was a Transformed block)\n";
     1664      }
     1665#endif
     1666    }
     1667
     1668#if DEBUG_TRANSFORM_AND_QUANTISE
     1669    std::cout << g_debugCounter << ": " << uiWidth << "x" << uiHeight << " channel " << compID << " TU at output of inverse-transform\n";
     1670    printBlock(pcResidual, uiWidth, uiHeight, uiStride);
     1671    g_debugCounter++;
     1672#endif
     1673  }
     1674
     1675  invRdpcmNxN( rTu, compID, pcResidual, uiStride );
     1676}
     1677
     1678Void TComTrQuant::invRecurTransformNxN( const ComponentID compID,
     1679                                        TComYuv *pResidual,
     1680                                        TComTU &rTu)
     1681{
     1682  if (!rTu.ProcessComponentSection(compID))
     1683  {
     1684    return;
     1685  }
     1686
     1687  TComDataCU* pcCU = rTu.getCU();
     1688  UInt absPartIdxTU = rTu.GetAbsPartIdxTU();
     1689  UInt uiTrMode=rTu.GetTransformDepthRel();
     1690  if( (pcCU->getCbf(absPartIdxTU, compID, uiTrMode) == 0) && (isLuma(compID) || !pcCU->getSlice()->getPPS()->getPpsRangeExtension().getCrossComponentPredictionEnabledFlag()) )
     1691  {
     1692    return;
     1693  }
     1694
     1695  if( uiTrMode == pcCU->getTransformIdx( absPartIdxTU ) )
     1696  {
     1697    const TComRectangle &tuRect      = rTu.getRect(compID);
     1698    const Int            uiStride    = pResidual->getStride( compID );
     1699          Pel           *rpcResidual = pResidual->getAddr( compID );
     1700          UInt           uiAddr      = (tuRect.x0 + uiStride*tuRect.y0);
     1701          Pel           *pResi       = rpcResidual + uiAddr;
     1702          TCoeff        *pcCoeff     = pcCU->getCoeff(compID) + rTu.getCoefficientOffset(compID);
     1703
     1704    const QpParam cQP(*pcCU, compID);
     1705
     1706    if(pcCU->getCbf(absPartIdxTU, compID, uiTrMode) != 0)
     1707    {
     1708      DEBUG_STRING_NEW(sTemp)
     1709#if DEBUG_STRING
     1710      std::string *psDebug=((DebugOptionList::DebugString_InvTran.getInt()&(pcCU->isIntra(absPartIdxTU)?1:(pcCU->isInter(absPartIdxTU)?2:4)))!=0) ? &sTemp : 0;
     1711#endif
     1712
     1713      invTransformNxN( rTu, compID, pResi, uiStride, pcCoeff, cQP DEBUG_STRING_PASS_INTO(psDebug) );
     1714
     1715#if DEBUG_STRING
     1716      if (psDebug != 0)
     1717      {
     1718        std::cout << (*psDebug);
     1719      }
     1720#endif
     1721    }
     1722
     1723    if (isChroma(compID) && (pcCU->getCrossComponentPredictionAlpha(absPartIdxTU, compID) != 0))
     1724    {
     1725      const Pel *piResiLuma = pResidual->getAddr( COMPONENT_Y );
     1726      const Int  strideLuma = pResidual->getStride( COMPONENT_Y );
     1727      const Int  tuWidth    = rTu.getRect( compID ).width;
     1728      const Int  tuHeight   = rTu.getRect( compID ).height;
     1729
     1730      if(pcCU->getCbf(absPartIdxTU, COMPONENT_Y, uiTrMode) != 0)
     1731      {
     1732        pResi = rpcResidual + uiAddr;
     1733        const Pel *pResiLuma = piResiLuma + uiAddr;
     1734
     1735        crossComponentPrediction( rTu, compID, pResiLuma, pResi, pResi, tuWidth, tuHeight, strideLuma, uiStride, uiStride, true );
     1736      }
     1737    }
     1738  }
     1739  else
     1740  {
     1741    TComTURecurse tuRecurseChild(rTu, false);
     1742    do
     1743    {
     1744      invRecurTransformNxN( compID, pResidual, tuRecurseChild );
     1745    } while (tuRecurseChild.nextSection(rTu));
     1746  }
     1747}
     1748
     1749Void TComTrQuant::applyForwardRDPCM( TComTU& rTu, const ComponentID compID, Pel* pcResidual, const UInt uiStride, const QpParam& cQP, TCoeff* pcCoeff, TCoeff &uiAbsSum, const RDPCMMode mode )
     1750{
     1751  TComDataCU *pcCU=rTu.getCU();
     1752  const UInt uiAbsPartIdx=rTu.GetAbsPartIdxTU();
     1753
     1754  const Bool bLossless      = pcCU->getCUTransquantBypass( uiAbsPartIdx );
     1755  const UInt uiWidth        = rTu.getRect(compID).width;
     1756  const UInt uiHeight       = rTu.getRect(compID).height;
     1757  const Bool rotateResidual = rTu.isNonTransformedResidualRotated(compID);
     1758  const UInt uiSizeMinus1   = (uiWidth * uiHeight) - 1;
     1759
     1760  UInt uiX = 0;
     1761  UInt uiY = 0;
     1762
     1763        UInt &majorAxis             = (mode == RDPCM_VER) ? uiX      : uiY;
     1764        UInt &minorAxis             = (mode == RDPCM_VER) ? uiY      : uiX;
     1765  const UInt  majorAxisLimit        = (mode == RDPCM_VER) ? uiWidth  : uiHeight;
     1766  const UInt  minorAxisLimit        = (mode == RDPCM_VER) ? uiHeight : uiWidth;
     1767
     1768  const Bool bUseHalfRoundingPoint  = (mode != RDPCM_OFF);
     1769
    12421770  uiAbsSum = 0;
    1243   assert( (pcCU->getSlice()->getSPS()->getMaxTrSize() >= uiWidth) );
    1244   Int bitDepth = eTType == TEXT_LUMA ? g_bitDepthY : g_bitDepthC;
    1245   if(useTransformSkip)
    1246   {
    1247     xTransformSkip(bitDepth, pcResidual, uiStride, m_plTempCoeff, uiWidth, uiHeight );
    1248   }
    1249   else
    1250   {
    1251     xT(bitDepth, uiMode, pcResidual, uiStride, m_plTempCoeff, uiWidth, uiHeight );
    1252   }
    1253   xQuant( pcCU, m_plTempCoeff, rpcCoeff,
    1254 #if ADAPTIVE_QP_SELECTION
    1255        rpcArlCoeff,
    1256 #endif
    1257        uiWidth, uiHeight, uiAbsSum, eTType, uiAbsPartIdx );
    1258 }
    1259 
    1260 Void TComTrQuant::invtransformNxN( Bool transQuantBypass, TextType eText, UInt uiMode,Pel* rpcResidual, UInt uiStride, TCoeff*   pcCoeff, UInt uiWidth, UInt uiHeight,  Int scalingListType, Bool useTransformSkip )
    1261 {
    1262   if(transQuantBypass)
    1263   {
    1264     for (UInt k = 0; k<uiHeight; k++)
    1265     {
    1266       for (UInt j = 0; j<uiWidth; j++)
    1267       {
    1268         rpcResidual[k*uiStride+j] = pcCoeff[k*uiWidth+j];
    1269       }
    1270     }
    1271     return;
    1272   }
    1273   Int bitDepth = eText == TEXT_LUMA ? g_bitDepthY : g_bitDepthC;
    1274   xDeQuant(bitDepth, pcCoeff, m_plTempCoeff, uiWidth, uiHeight, scalingListType);
    1275   if(useTransformSkip == true)
    1276   {
    1277     xITransformSkip(bitDepth, m_plTempCoeff, rpcResidual, uiStride, uiWidth, uiHeight );
    1278   }
    1279   else
    1280   {
    1281     xIT(bitDepth, uiMode, m_plTempCoeff, rpcResidual, uiStride, uiWidth, uiHeight );
    1282   }
    1283 }
    1284 
    1285 Void TComTrQuant::invRecurTransformNxN( TComDataCU* pcCU, UInt uiAbsPartIdx, TextType eTxt, Pel* rpcResidual, UInt uiAddr, UInt uiStride, UInt uiWidth, UInt uiHeight, UInt uiMaxTrMode, UInt uiTrMode, TCoeff* rpcCoeff )
    1286 {
    1287   if( !pcCU->getCbf(uiAbsPartIdx, eTxt, uiTrMode) )
    1288   {
    1289     return;
    1290   } 
    1291   const UInt stopTrMode = pcCU->getTransformIdx( uiAbsPartIdx );
    1292  
    1293   if( uiTrMode == stopTrMode )
    1294   {
    1295     UInt uiDepth      = pcCU->getDepth( uiAbsPartIdx ) + uiTrMode;
    1296     UInt uiLog2TrSize = g_aucConvertToBit[ pcCU->getSlice()->getSPS()->getMaxCUWidth() >> uiDepth ] + 2;
    1297     if( eTxt != TEXT_LUMA && uiLog2TrSize == 2 )
    1298     {
    1299       UInt uiQPDiv = pcCU->getPic()->getNumPartInCU() >> ( ( uiDepth - 1 ) << 1 );
    1300       if( ( uiAbsPartIdx % uiQPDiv ) != 0 )
    1301       {
    1302         return;
    1303       }
    1304       uiWidth  <<= 1;
    1305       uiHeight <<= 1;
    1306     }
    1307     Pel* pResi = rpcResidual + uiAddr;
    1308     Int scalingListType = (pcCU->isIntra(uiAbsPartIdx) ? 0 : 3) + g_eTTable[(Int)eTxt];
    1309     assert(scalingListType < SCALING_LIST_NUM);
    1310     invtransformNxN( pcCU->getCUTransquantBypass(uiAbsPartIdx), eTxt, REG_DCT, pResi, uiStride, rpcCoeff, uiWidth, uiHeight, scalingListType, pcCU->getTransformSkip(uiAbsPartIdx, eTxt) );
    1311   }
    1312   else
    1313   {
    1314     uiTrMode++;
    1315     uiWidth  >>= 1;
    1316     uiHeight >>= 1;
    1317     Int trWidth = uiWidth, trHeight = uiHeight;
    1318     UInt uiAddrOffset = trHeight * uiStride;
    1319     UInt uiCoefOffset = trWidth * trHeight;
    1320     UInt uiPartOffset = pcCU->getTotalNumPart() >> ( uiTrMode << 1 );
    1321     {
    1322       invRecurTransformNxN( pcCU, uiAbsPartIdx, eTxt, rpcResidual, uiAddr                         , uiStride, uiWidth, uiHeight, uiMaxTrMode, uiTrMode, rpcCoeff ); rpcCoeff += uiCoefOffset; uiAbsPartIdx += uiPartOffset;
    1323       invRecurTransformNxN( pcCU, uiAbsPartIdx, eTxt, rpcResidual, uiAddr + trWidth               , uiStride, uiWidth, uiHeight, uiMaxTrMode, uiTrMode, rpcCoeff ); rpcCoeff += uiCoefOffset; uiAbsPartIdx += uiPartOffset;
    1324       invRecurTransformNxN( pcCU, uiAbsPartIdx, eTxt, rpcResidual, uiAddr + uiAddrOffset          , uiStride, uiWidth, uiHeight, uiMaxTrMode, uiTrMode, rpcCoeff ); rpcCoeff += uiCoefOffset; uiAbsPartIdx += uiPartOffset;
    1325       invRecurTransformNxN( pcCU, uiAbsPartIdx, eTxt, rpcResidual, uiAddr + uiAddrOffset + trWidth, uiStride, uiWidth, uiHeight, uiMaxTrMode, uiTrMode, rpcCoeff );
     1771
     1772  for ( majorAxis = 0; majorAxis < majorAxisLimit; majorAxis++ )
     1773  {
     1774    TCoeff accumulatorValue = 0; // 32-bit accumulator
     1775    for ( minorAxis = 0; minorAxis < minorAxisLimit; minorAxis++ )
     1776    {
     1777      const UInt sampleIndex      = (uiY * uiWidth) + uiX;
     1778      const UInt coefficientIndex = (rotateResidual ? (uiSizeMinus1-sampleIndex) : sampleIndex);
     1779      const Pel  currentSample    = pcResidual[(uiY * uiStride) + uiX];
     1780      const TCoeff encoderSideDelta = TCoeff(currentSample) - accumulatorValue;
     1781
     1782      Pel reconstructedDelta;
     1783      if ( bLossless )
     1784      {
     1785        pcCoeff[coefficientIndex] = encoderSideDelta;
     1786        reconstructedDelta        = (Pel) encoderSideDelta;
     1787      }
     1788      else
     1789      {
     1790        transformSkipQuantOneSample(rTu, compID, encoderSideDelta, pcCoeff, coefficientIndex, cQP, bUseHalfRoundingPoint);
     1791        invTrSkipDeQuantOneSample  (rTu, compID, pcCoeff[coefficientIndex], reconstructedDelta, cQP, coefficientIndex);
     1792      }
     1793
     1794      uiAbsSum += abs(pcCoeff[coefficientIndex]);
     1795
     1796      if (mode != RDPCM_OFF)
     1797      {
     1798        accumulatorValue += reconstructedDelta;
     1799      }
     1800    }
     1801  }
     1802}
     1803
     1804Void TComTrQuant::rdpcmNxN   ( TComTU& rTu, const ComponentID compID, Pel* pcResidual, const UInt uiStride, const QpParam& cQP, TCoeff* pcCoeff, TCoeff &uiAbsSum, RDPCMMode& rdpcmMode )
     1805{
     1806  TComDataCU *pcCU=rTu.getCU();
     1807  const UInt uiAbsPartIdx=rTu.GetAbsPartIdxTU();
     1808
     1809  if (!pcCU->isRDPCMEnabled(uiAbsPartIdx) || ((pcCU->getTransformSkip(uiAbsPartIdx, compID) == 0) && !pcCU->getCUTransquantBypass(uiAbsPartIdx)))
     1810  {
     1811    rdpcmMode = RDPCM_OFF;
     1812  }
     1813  else if ( pcCU->isIntra( uiAbsPartIdx ) )
     1814  {
     1815    const ChromaFormat chFmt = pcCU->getPic()->getPicYuvOrg()->getChromaFormat();
     1816    const ChannelType chType = toChannelType(compID);
     1817    const UInt uiChPredMode  = pcCU->getIntraDir( chType, uiAbsPartIdx );
     1818    const TComSPS *sps=pcCU->getSlice()->getSPS();
     1819    const UInt partsPerMinCU = 1<<(2*(sps->getMaxTotalCUDepth() - sps->getLog2DiffMaxMinCodingBlockSize()));
     1820    const UInt uiChCodedMode = (uiChPredMode==DM_CHROMA_IDX && isChroma(compID)) ? pcCU->getIntraDir(CHANNEL_TYPE_LUMA, getChromasCorrespondingPULumaIdx(uiAbsPartIdx, chFmt, partsPerMinCU)) : uiChPredMode;
     1821    const UInt uiChFinalMode = ((chFmt == CHROMA_422)       && isChroma(compID)) ? g_chroma422IntraAngleMappingTable[uiChCodedMode] : uiChCodedMode;
     1822
     1823    if (uiChFinalMode == VER_IDX || uiChFinalMode == HOR_IDX)
     1824    {
     1825      rdpcmMode = (uiChFinalMode == VER_IDX) ? RDPCM_VER : RDPCM_HOR;
     1826      applyForwardRDPCM( rTu, compID, pcResidual, uiStride, cQP, pcCoeff, uiAbsSum, rdpcmMode );
     1827    }
     1828    else
     1829    {
     1830      rdpcmMode = RDPCM_OFF;
     1831    }
     1832  }
     1833  else // not intra, need to select the best mode
     1834  {
     1835    const UInt uiWidth  = rTu.getRect(compID).width;
     1836    const UInt uiHeight = rTu.getRect(compID).height;
     1837
     1838    RDPCMMode bestMode   = NUMBER_OF_RDPCM_MODES;
     1839    TCoeff    bestAbsSum = std::numeric_limits<TCoeff>::max();
     1840    TCoeff    bestCoefficients[MAX_TU_SIZE * MAX_TU_SIZE];
     1841
     1842    for (UInt modeIndex = 0; modeIndex < NUMBER_OF_RDPCM_MODES; modeIndex++)
     1843    {
     1844      const RDPCMMode mode = RDPCMMode(modeIndex);
     1845
     1846      TCoeff currAbsSum = 0;
     1847
     1848      applyForwardRDPCM( rTu, compID, pcResidual, uiStride, cQP, pcCoeff, currAbsSum, mode );
     1849
     1850      if (currAbsSum < bestAbsSum)
     1851      {
     1852        bestMode   = mode;
     1853        bestAbsSum = currAbsSum;
     1854        if (mode != RDPCM_OFF)
     1855        {
     1856          memcpy(bestCoefficients, pcCoeff, (uiWidth * uiHeight * sizeof(TCoeff)));
     1857        }
     1858      }
     1859    }
     1860
     1861    rdpcmMode = bestMode;
     1862    uiAbsSum  = bestAbsSum;
     1863
     1864    if (rdpcmMode != RDPCM_OFF) //the TU is re-transformed and quantised if DPCM_OFF is returned, so there is no need to preserve it here
     1865    {
     1866      memcpy(pcCoeff, bestCoefficients, (uiWidth * uiHeight * sizeof(TCoeff)));
     1867    }
     1868  }
     1869
     1870  pcCU->setExplicitRdpcmModePartRange(rdpcmMode, compID, uiAbsPartIdx, rTu.GetAbsPartIdxNumParts(compID));
     1871}
     1872
     1873Void TComTrQuant::invRdpcmNxN( TComTU& rTu, const ComponentID compID, Pel* pcResidual, const UInt uiStride )
     1874{
     1875  TComDataCU *pcCU=rTu.getCU();
     1876  const UInt uiAbsPartIdx=rTu.GetAbsPartIdxTU();
     1877
     1878  if (pcCU->isRDPCMEnabled( uiAbsPartIdx ) && ((pcCU->getTransformSkip(uiAbsPartIdx, compID ) != 0) || pcCU->getCUTransquantBypass(uiAbsPartIdx)))
     1879  {
     1880    const UInt uiWidth  = rTu.getRect(compID).width;
     1881    const UInt uiHeight = rTu.getRect(compID).height;
     1882
     1883    RDPCMMode rdpcmMode = RDPCM_OFF;
     1884
     1885    if ( pcCU->isIntra( uiAbsPartIdx ) )
     1886    {
     1887      const ChromaFormat chFmt = pcCU->getPic()->getPicYuvRec()->getChromaFormat();
     1888      const ChannelType chType = toChannelType(compID);
     1889      const UInt uiChPredMode  = pcCU->getIntraDir( chType, uiAbsPartIdx );
     1890      const TComSPS *sps=pcCU->getSlice()->getSPS();
     1891      const UInt partsPerMinCU = 1<<(2*(sps->getMaxTotalCUDepth() - sps->getLog2DiffMaxMinCodingBlockSize()));
     1892      const UInt uiChCodedMode = (uiChPredMode==DM_CHROMA_IDX && isChroma(compID)) ? pcCU->getIntraDir(CHANNEL_TYPE_LUMA, getChromasCorrespondingPULumaIdx(uiAbsPartIdx, chFmt, partsPerMinCU)) : uiChPredMode;
     1893      const UInt uiChFinalMode = ((chFmt == CHROMA_422)       && isChroma(compID)) ? g_chroma422IntraAngleMappingTable[uiChCodedMode] : uiChCodedMode;
     1894
     1895      if (uiChFinalMode == VER_IDX || uiChFinalMode == HOR_IDX)
     1896      {
     1897        rdpcmMode = (uiChFinalMode == VER_IDX) ? RDPCM_VER : RDPCM_HOR;
     1898      }
     1899    }
     1900    else  // not intra case
     1901    {
     1902      rdpcmMode = RDPCMMode(pcCU->getExplicitRdpcmMode( compID, uiAbsPartIdx ));
     1903    }
     1904
     1905    const TCoeff pelMin=(TCoeff) std::numeric_limits<Pel>::min();
     1906    const TCoeff pelMax=(TCoeff) std::numeric_limits<Pel>::max();
     1907    if (rdpcmMode == RDPCM_VER)
     1908    {
     1909      for( UInt uiX = 0; uiX < uiWidth; uiX++ )
     1910      {
     1911        Pel *pcCurResidual = pcResidual+uiX;
     1912        TCoeff accumulator = *pcCurResidual; // 32-bit accumulator
     1913        pcCurResidual+=uiStride;
     1914        for( UInt uiY = 1; uiY < uiHeight; uiY++, pcCurResidual+=uiStride )
     1915        {
     1916          accumulator += *(pcCurResidual);
     1917          *pcCurResidual = (Pel)Clip3<TCoeff>(pelMin, pelMax, accumulator);
     1918        }
     1919      }
     1920    }
     1921    else if (rdpcmMode == RDPCM_HOR)
     1922    {
     1923      for( UInt uiY = 0; uiY < uiHeight; uiY++ )
     1924      {
     1925        Pel *pcCurResidual = pcResidual+uiY*uiStride;
     1926        TCoeff accumulator = *pcCurResidual;
     1927        pcCurResidual++;
     1928        for( UInt uiX = 1; uiX < uiWidth; uiX++, pcCurResidual++ )
     1929        {
     1930          accumulator += *(pcCurResidual);
     1931          *pcCurResidual = (Pel)Clip3<TCoeff>(pelMin, pelMax, accumulator);
     1932        }
     1933      }
    13261934    }
    13271935  }
     
    13321940// ------------------------------------------------------------------------------------------------
    13331941
    1334 /** Wrapper function between HM interface and core NxN forward transform (2D)
     1942/** Wrapper function between HM interface and core NxN forward transform (2D)
     1943 *  \param channelBitDepth bit depth of channel
     1944 *  \param useDST flag forwarded to the core transform (xTr / xTrMxN); requests the DST instead of the DCT
    13351945 *  \param piBlkResi input data (residual)
     1946 *  \param uiStride stride of input residual data
    13361947 *  \param psCoeff output data (transform coefficients)
    1337  *  \param uiStride stride of input residual data
    1338  *  \param iSize transform size (iSize x iSize)
    1339  *  \param uiMode is Intra Prediction mode used in Mode-Dependent DCT/DST only
     1948 *  \param iWidth transform width
     1949 *  \param iHeight transform height
     1950 *  \param maxLog2TrDynamicRange maximum log2 dynamic range of the transform coefficients, forwarded to the core transform
    13401951 */
    1341 Void TComTrQuant::xT(Int bitDepth, UInt uiMode, Pel* piBlkResi, UInt uiStride, Int* psCoeff, Int iWidth, Int iHeight )
    1342 {
    1343 #if MATRIX_MULT 
    1344   Int iSize = iWidth;
    1345   xTr(bitDepth, piBlkResi,psCoeff,uiStride,(UInt)iSize,uiMode);
    1346 #else
    1347   Int j;
    1348   Short block[ 32 * 32 ];
    1349   Short coeff[ 32 * 32 ];
    1350       for (j = 0; j < iHeight; j++)
    1351       {   
    1352         memcpy( block + j * iWidth, piBlkResi + j * uiStride, iWidth * sizeof( Short ) );
    1353       }
    1354     xTrMxN(bitDepth, block, coeff, iWidth, iHeight, uiMode );
    1355     for ( j = 0; j < iHeight * iWidth; j++ )
    1356     {   
    1357       psCoeff[ j ] = coeff[ j ];
    1358     }
    1359 #endif 
    1360 }
    1361 
    1362 
    1363 /** Wrapper function between HM interface and core NxN inverse transform (2D)
     1952Void TComTrQuant::xT( const Int channelBitDepth, Bool useDST, Pel* piBlkResi, UInt uiStride, TCoeff* psCoeff, Int iWidth, Int iHeight, const Int maxLog2TrDynamicRange )
     1953{
     1954#if MATRIX_MULT
     1955  if( iWidth == iHeight)
     1956  {
     1957    xTr(channelBitDepth, piBlkResi, psCoeff, uiStride, (UInt)iWidth, useDST, maxLog2TrDynamicRange);
     1958    return;
     1959  }
     1960#endif
     1961
     1962  TCoeff block[ MAX_TU_SIZE * MAX_TU_SIZE ];
     1963  TCoeff coeff[ MAX_TU_SIZE * MAX_TU_SIZE ];
     1964
     1965  for (Int y = 0; y < iHeight; y++)
     1966  {
     1967    for (Int x = 0; x < iWidth; x++)
     1968    {
     1969      block[(y * iWidth) + x] = piBlkResi[(y * uiStride) + x];
     1970    }
     1971  }
     1972
     1973  xTrMxN( channelBitDepth, block, coeff, iWidth, iHeight, useDST, maxLog2TrDynamicRange );
     1974
     1975  memcpy(psCoeff, coeff, (iWidth * iHeight * sizeof(TCoeff)));
     1976}
     1977
     1978/** Wrapper function between HM interface and core NxN inverse transform (2D)
     1979 *  \param channelBitDepth bit depth of channel
     1980 *  \param useDST flag forwarded to the core inverse transform (xITr / xITrMxN); requests the DST instead of the DCT
    13641981 *  \param plCoef input data (transform coefficients)
    13651982 *  \param pResidual output data (residual)
    13661983 *  \param uiStride stride of output residual data
    1367  *  \param iSize transform size (iSize x iSize)
    1368  *  \param uiMode is Intra Prediction mode used in Mode-Dependent DCT/DST only
     1984 *  \param iWidth transform width
     1985 *  \param iHeight transform height
     1986 *  \param maxLog2TrDynamicRange maximum log2 dynamic range of the transform coefficients, forwarded to the core inverse transform
    13691987 */
    1370 Void TComTrQuant::xIT(Int bitDepth, UInt uiMode, Int* plCoef, Pel* pResidual, UInt uiStride, Int iWidth, Int iHeight )
    1371 {
    1372 #if MATRIX_MULT 
    1373   Int iSize = iWidth;
    1374   xITr(bitDepth, plCoef,pResidual,uiStride,(UInt)iSize,uiMode);
    1375 #else
    1376   Int j;
    1377   {
    1378     Short block[ 32 * 32 ];
    1379     Short coeff[ 32 * 32 ];
    1380     for ( j = 0; j < iHeight * iWidth; j++ )
    1381     {   
    1382       coeff[j] = (Short)plCoef[j];
    1383     }
    1384     xITrMxN(bitDepth, coeff, block, iWidth, iHeight, uiMode );
    1385     {
    1386       for ( j = 0; j < iHeight; j++ )
    1387       {   
    1388         memcpy( pResidual + j * uiStride, block + j * iWidth, iWidth * sizeof(Short) );
    1389       }
    1390     }
    1391     return ;
    1392   }
    1393 #endif 
    1394 }
    1395  
     1988Void TComTrQuant::xIT( const Int channelBitDepth, Bool useDST, TCoeff* plCoef, Pel* pResidual, UInt uiStride, Int iWidth, Int iHeight, const Int maxLog2TrDynamicRange )
     1989{
     1990#if MATRIX_MULT
     1991  if( iWidth == iHeight )
     1992  {
     1993    xITr(channelBitDepth, plCoef, pResidual, uiStride, (UInt)iWidth, useDST, maxLog2TrDynamicRange);
     1994    return;
     1995  }
     1996#endif
     1997
     1998  TCoeff block[ MAX_TU_SIZE * MAX_TU_SIZE ];
     1999  TCoeff coeff[ MAX_TU_SIZE * MAX_TU_SIZE ];
     2000
     2001  memcpy(coeff, plCoef, (iWidth * iHeight * sizeof(TCoeff)));
     2002
     2003  xITrMxN( channelBitDepth, coeff, block, iWidth, iHeight, useDST, maxLog2TrDynamicRange );
     2004
     2005  for (Int y = 0; y < iHeight; y++)
     2006  {
     2007    for (Int x = 0; x < iWidth; x++)
     2008    {
     2009      pResidual[(y * uiStride) + x] = Pel(block[(y * iWidth) + x]);
     2010    }
     2011  }
     2012}
     2013
    13962014/** Wrapper function between HM interface and core NxN transform skipping
    13972015 *  \param piBlkResi input data (residual)
     2016 *  \param uiStride stride of input residual data
    13982017 *  \param psCoeff output data (transform coefficients)
    1399  *  \param uiStride stride of input residual data
    1400  *  \param iSize transform size (iSize x iSize)
     2018 *  \param rTu reference to transform data
     2019 *  \param component colour component
    14012020 */
    1402 Void TComTrQuant::xTransformSkip(Int bitDepth, Pel* piBlkResi, UInt uiStride, Int* psCoeff, Int width, Int height )
    1403 {
    1404   assert( width == height );
    1405   UInt uiLog2TrSize = g_aucConvertToBit[ width ] + 2;
    1406   Int  shift = MAX_TR_DYNAMIC_RANGE - bitDepth - uiLog2TrSize;
    1407   UInt transformSkipShift;
    1408   Int  j,k;
    1409   if(shift >= 0)
    1410   {
    1411     transformSkipShift = shift;
    1412     for (j = 0; j < height; j++)
    1413     {   
    1414       for(k = 0; k < width; k ++)
    1415       {
    1416         psCoeff[j*height + k] = piBlkResi[j * uiStride + k] << transformSkipShift;     
    1417       }
    1418     }
    1419   }
    1420   else
    1421   {
    1422     //The case when uiBitDepth > 13
    1423     Int offset;
    1424     transformSkipShift = -shift;
    1425     offset = (1 << (transformSkipShift - 1));
    1426     for (j = 0; j < height; j++)
    1427     {   
    1428       for(k = 0; k < width; k ++)
    1429       {
    1430         psCoeff[j*height + k] = (piBlkResi[j * uiStride + k] + offset) >> transformSkipShift;     
    1431       }
    1432     }
    1433   }
    1434 }
    1435 
    1436 /** Wrapper function between HM interface and core NxN transform skipping
     2021Void TComTrQuant::xTransformSkip( Pel* piBlkResi, UInt uiStride, TCoeff* psCoeff, TComTU &rTu, const ComponentID component )
     2022{
     2023  const TComRectangle &rect = rTu.getRect(component);
     2024  const Int width           = rect.width;
     2025  const Int height          = rect.height;
     2026  const Int maxLog2TrDynamicRange = rTu.getCU()->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(component));
     2027  const Int channelBitDepth = rTu.getCU()->getSlice()->getSPS()->getBitDepth(toChannelType(component));
     2028
     2029  Int iTransformShift = getTransformShift(channelBitDepth, rTu.GetEquivalentLog2TrSize(component), maxLog2TrDynamicRange);
     2030  if (rTu.getCU()->getSlice()->getSPS()->getSpsRangeExtension().getExtendedPrecisionProcessingFlag())
     2031  {
     2032    iTransformShift = std::max<Int>(0, iTransformShift);
     2033  }
     2034
     2035  const Bool rotateResidual = rTu.isNonTransformedResidualRotated(component);
     2036  const UInt uiSizeMinus1   = (width * height) - 1;
     2037
     2038  if (iTransformShift >= 0)
     2039  {
     2040    for (UInt y = 0, coefficientIndex = 0; y < height; y++)
     2041    {
     2042      for (UInt x = 0; x < width; x++, coefficientIndex++)
     2043      {
     2044        psCoeff[rotateResidual ? (uiSizeMinus1 - coefficientIndex) : coefficientIndex] = TCoeff(piBlkResi[(y * uiStride) + x]) << iTransformShift;
     2045      }
     2046    }
     2047  }
     2048  else //for very high bit depths
     2049  {
     2050    iTransformShift = -iTransformShift;
     2051    const TCoeff offset = 1 << (iTransformShift - 1);
     2052
     2053    for (UInt y = 0, coefficientIndex = 0; y < height; y++)
     2054    {
     2055      for (UInt x = 0; x < width; x++, coefficientIndex++)
     2056      {
     2057        psCoeff[rotateResidual ? (uiSizeMinus1 - coefficientIndex) : coefficientIndex] = (TCoeff(piBlkResi[(y * uiStride) + x]) + offset) >> iTransformShift;
     2058      }
     2059    }
     2060  }
     2061}
     2062
     2063/** Wrapper function between HM interface and core NxN transform skipping
    14372064 *  \param plCoef input data (coefficients)
    14382065 *  \param pResidual output data (residual)
    14392066 *  \param uiStride stride of output residual data
    1440  *  \param iSize transform size (iSize x iSize)
     2067 *  \param rTu reference to transform data
     2068 *  \param component colour component ID
    14412069 */
    1442 Void TComTrQuant::xITransformSkip(Int bitDepth, Int* plCoef, Pel* pResidual, UInt uiStride, Int width, Int height )
    1443 {
    1444   assert( width == height );
    1445   UInt uiLog2TrSize = g_aucConvertToBit[ width ] + 2;
    1446   Int  shift = MAX_TR_DYNAMIC_RANGE - bitDepth - uiLog2TrSize;
    1447   UInt transformSkipShift;
    1448   Int  j,k;
    1449   if(shift > 0)
    1450   {
    1451     Int offset;
    1452     transformSkipShift = shift;
    1453     offset = (1 << (transformSkipShift -1));
    1454     for ( j = 0; j < height; j++ )
    1455     {   
    1456       for(k = 0; k < width; k ++)
    1457       {
    1458         pResidual[j * uiStride + k] =  (plCoef[j*width+k] + offset) >> transformSkipShift;
    1459       }
    1460     }
    1461   }
    1462   else
    1463   {
    1464     //The case when uiBitDepth >= 13
    1465     transformSkipShift = - shift;
    1466     for ( j = 0; j < height; j++ )
    1467     {   
    1468       for(k = 0; k < width; k ++)
    1469       {
    1470         pResidual[j * uiStride + k] =  plCoef[j*width+k] << transformSkipShift;
     2070Void TComTrQuant::xITransformSkip( TCoeff* plCoef, Pel* pResidual, UInt uiStride, TComTU &rTu, const ComponentID component )
     2071{
     2072  const TComRectangle &rect = rTu.getRect(component);
     2073  const Int width           = rect.width;
     2074  const Int height          = rect.height;
     2075  const Int maxLog2TrDynamicRange = rTu.getCU()->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(component));
     2076#if O0043_BEST_EFFORT_DECODING
     2077  const Int channelBitDepth = rTu.getCU()->getSlice()->getSPS()->getStreamBitDepth(toChannelType(component));
     2078#else
     2079  const Int channelBitDepth = rTu.getCU()->getSlice()->getSPS()->getBitDepth(toChannelType(component));
     2080#endif
     2081
     2082  Int iTransformShift = getTransformShift(channelBitDepth, rTu.GetEquivalentLog2TrSize(component), maxLog2TrDynamicRange);
     2083  if (rTu.getCU()->getSlice()->getSPS()->getSpsRangeExtension().getExtendedPrecisionProcessingFlag())
     2084  {
     2085    iTransformShift = std::max<Int>(0, iTransformShift);
     2086  }
     2087
     2088  const Bool rotateResidual = rTu.isNonTransformedResidualRotated(component);
     2089  const UInt uiSizeMinus1   = (width * height) - 1;
     2090
     2091  if (iTransformShift >= 0)
     2092  {
     2093    const TCoeff offset = iTransformShift==0 ? 0 : (1 << (iTransformShift - 1));
     2094
     2095    for (UInt y = 0, coefficientIndex = 0; y < height; y++)
     2096    {
     2097      for (UInt x = 0; x < width; x++, coefficientIndex++)
     2098      {
     2099        pResidual[(y * uiStride) + x] =  Pel((plCoef[rotateResidual ? (uiSizeMinus1 - coefficientIndex) : coefficientIndex] + offset) >> iTransformShift);
     2100      }
     2101    }
     2102  }
     2103  else //for very high bit depths
     2104  {
     2105    iTransformShift = -iTransformShift;
     2106
     2107    for (UInt y = 0, coefficientIndex = 0; y < height; y++)
     2108    {
     2109      for (UInt x = 0; x < width; x++, coefficientIndex++)
     2110      {
     2111        pResidual[(y * uiStride) + x] = Pel(plCoef[rotateResidual ? (uiSizeMinus1 - coefficientIndex) : coefficientIndex] << iTransformShift);
    14712112      }
    14722113    }
     
    14752116
    14762117/** RDOQ with CABAC
    1477  * \param pcCU pointer to coding unit structure
     2118 * \param rTu reference to transform data
    14782119 * \param plSrcCoeff pointer to input buffer
    14792120 * \param piDstCoeff reference to pointer to output buffer
    1480  * \param uiWidth block width
    1481  * \param uiHeight block height
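     2121 * \param piArlDstCoeff pointer to output buffer for the levels rounded at ARL_C_PRECISION finer scale (only present when ADAPTIVE_QP_SELECTION is enabled)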
    14822122 * \param uiAbsSum reference to absolute sum of quantized transform coefficient
    1483  * \param eTType plane type / luminance or chrominance
    1484  * \param uiAbsPartIdx absolute partition index
    1485  * \returns Void
     2123 * \param compID colour component ID
     2124 * \param cQP reference to quantization parameters
     2125
    14862126 * Rate distortion optimized quantization for entropy
    14872127 * coding engines using probability models like CABAC
    14882128 */
    1489 Void TComTrQuant::xRateDistOptQuant                 ( TComDataCU*                     pcCU,
    1490                                                       Int*                            plSrcCoeff,
    1491                                                       TCoeff*                        piDstCoeff,
     2129Void TComTrQuant::xRateDistOptQuant                 (       TComTU       &rTu,
     2130                                                            TCoeff      * plSrcCoeff,
     2131                                                            TCoeff      * piDstCoeff,
    14922132#if ADAPTIVE_QP_SELECTION
    1493                                                       Int*&                           piArlDstCoeff,
    1494 #endif
    1495                                                       UInt                            uiWidth,
    1496                                                       UInt                            uiHeight,
    1497                                                       UInt&                           uiAbsSum,
    1498                                                       TextType                        eTType,
    1499                                                       UInt                            uiAbsPartIdx )
    1500 {
    1501   UInt uiLog2TrSize = g_aucConvertToBit[ uiWidth ] + 2;
    1502  
    1503   UInt uiBitDepth = eTType == TEXT_LUMA ? g_bitDepthY : g_bitDepthC;
    1504   Int iTransformShift = MAX_TR_DYNAMIC_RANGE - uiBitDepth - uiLog2TrSize;  // Represents scaling through forward transform
    1505   UInt       uiGoRiceParam       = 0;
    1506   Double     d64BlockUncodedCost = 0;
    1507   const UInt uiLog2BlkSize       = g_aucConvertToBit[ uiWidth ] + 2;
    1508   const UInt uiMaxNumCoeff       = uiWidth * uiHeight;
    1509   Int scalingListType = (pcCU->isIntra(uiAbsPartIdx) ? 0 : 3) + g_eTTable[(Int)eTType];
     2133                                                            TCoeff      * piArlDstCoeff,
     2134#endif
     2135                                                            TCoeff       &uiAbsSum,
     2136                                                      const ComponentID   compID,
     2137                                                      const QpParam      &cQP  )
     2138{
     2139  const TComRectangle  & rect             = rTu.getRect(compID);
     2140  const UInt             uiWidth          = rect.width;
     2141  const UInt             uiHeight         = rect.height;
     2142        TComDataCU    *  pcCU             = rTu.getCU();
     2143  const UInt             uiAbsPartIdx     = rTu.GetAbsPartIdxTU();
     2144  const ChannelType      channelType      = toChannelType(compID);
     2145  const UInt             uiLog2TrSize     = rTu.GetEquivalentLog2TrSize(compID);
     2146
     2147  const Bool             extendedPrecision = pcCU->getSlice()->getSPS()->getSpsRangeExtension().getExtendedPrecisionProcessingFlag();
     2148  const Int              maxLog2TrDynamicRange = pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID));
     2149  const Int              channelBitDepth = rTu.getCU()->getSlice()->getSPS()->getBitDepth(channelType);
     2150
     2151  /* for 422 chroma blocks, the effective scaling applied during transformation is not a power of 2, hence it cannot be
     2152   * implemented as a bit-shift (the quantised result will be sqrt(2) * larger than required). Alternatively, adjust the
     2153   * uiLog2TrSize applied in iTransformShift, such that the result is 1/sqrt(2) the required result (i.e. smaller)
     2154   * Then a QP+3 (sqrt(2)) or QP-3 (1/sqrt(2)) method could be used to get the required result
     2155   */
     2156
     2157  // Represents scaling through forward transform
     2158  Int iTransformShift = getTransformShift(channelBitDepth, uiLog2TrSize, maxLog2TrDynamicRange);
     2159  if ((pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0) && extendedPrecision)
     2160  {
     2161    iTransformShift = std::max<Int>(0, iTransformShift);
     2162  }
     2163
     2164  const Bool bUseGolombRiceParameterAdaptation = pcCU->getSlice()->getSPS()->getSpsRangeExtension().getPersistentRiceAdaptationEnabledFlag();
     2165  const UInt initialGolombRiceParameter        = m_pcEstBitsSbac->golombRiceAdaptationStatistics[rTu.getGolombRiceStatisticsIndex(compID)] / RExt__GOLOMB_RICE_INCREMENT_DIVISOR;
     2166        UInt uiGoRiceParam                     = initialGolombRiceParameter;
     2167  Double     d64BlockUncodedCost               = 0;
     2168  const UInt uiLog2BlockWidth                  = g_aucConvertToBit[ uiWidth  ] + 2;
     2169  const UInt uiLog2BlockHeight                 = g_aucConvertToBit[ uiHeight ] + 2;
     2170  const UInt uiMaxNumCoeff                     = uiWidth * uiHeight;
     2171  assert(compID<MAX_NUM_COMPONENT);
     2172
     2173  Int scalingListType = getScalingListType(pcCU->getPredictionMode(uiAbsPartIdx), compID);
    15102174  assert(scalingListType < SCALING_LIST_NUM);
    1511  
    1512   Int iQBits = QUANT_SHIFT + m_cQP.m_iPer + iTransformShift;                   // Right shift of non-RDOQ quantizer;  level = (coeff*uiQ + offset)>>q_bits
    1513   Double *pdErrScaleOrg = getErrScaleCoeff(scalingListType,uiLog2TrSize-2,m_cQP.m_iRem);
    1514   Int *piQCoefOrg = getQuantCoeff(scalingListType,m_cQP.m_iRem,uiLog2TrSize-2);
    1515   Int *piQCoef = piQCoefOrg;
    1516   Double *pdErrScale = pdErrScaleOrg;
     2175
     2176#if ADAPTIVE_QP_SELECTION
     2177  memset(piArlDstCoeff, 0, sizeof(TCoeff) *  uiMaxNumCoeff);
     2178#endif
     2179
     2180  Double pdCostCoeff [ MAX_TU_SIZE * MAX_TU_SIZE ];
     2181  Double pdCostSig   [ MAX_TU_SIZE * MAX_TU_SIZE ];
     2182  Double pdCostCoeff0[ MAX_TU_SIZE * MAX_TU_SIZE ];
     2183  memset( pdCostCoeff, 0, sizeof(Double) *  uiMaxNumCoeff );
     2184  memset( pdCostSig,   0, sizeof(Double) *  uiMaxNumCoeff );
     2185  Int rateIncUp   [ MAX_TU_SIZE * MAX_TU_SIZE ];
     2186  Int rateIncDown [ MAX_TU_SIZE * MAX_TU_SIZE ];
     2187  Int sigRateDelta[ MAX_TU_SIZE * MAX_TU_SIZE ];
     2188  TCoeff deltaU   [ MAX_TU_SIZE * MAX_TU_SIZE ];
     2189  memset( rateIncUp,    0, sizeof(Int   ) *  uiMaxNumCoeff );
     2190  memset( rateIncDown,  0, sizeof(Int   ) *  uiMaxNumCoeff );
     2191  memset( sigRateDelta, 0, sizeof(Int   ) *  uiMaxNumCoeff );
     2192  memset( deltaU,       0, sizeof(TCoeff) *  uiMaxNumCoeff );
     2193
     2194  const Int iQBits = QUANT_SHIFT + cQP.per + iTransformShift;                   // Right shift of non-RDOQ quantizer;  level = (coeff*uiQ + offset)>>q_bits
     2195  const Double *const pdErrScale = getErrScaleCoeff(scalingListType, (uiLog2TrSize-2), cQP.rem);
     2196  const Int    *const piQCoef    = getQuantCoeff(scalingListType, cQP.rem, (uiLog2TrSize-2));
     2197
     2198  const Bool   enableScalingLists             = getUseScalingList(uiWidth, uiHeight, (pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0));
     2199  const Int    defaultQuantisationCoefficient = g_quantScales[cQP.rem];
     2200  const Double defaultErrorScale              = getErrScaleCoeffNoScalingList(scalingListType, (uiLog2TrSize-2), cQP.rem);
     2201
     2202  const TCoeff entropyCodingMinimum = -(1 << maxLog2TrDynamicRange);
     2203  const TCoeff entropyCodingMaximum =  (1 << maxLog2TrDynamicRange) - 1;
     2204
    15172205#if ADAPTIVE_QP_SELECTION
    15182206  Int iQBitsC = iQBits - ARL_C_PRECISION;
    15192207  Int iAddC =  1 << (iQBitsC-1);
    15202208#endif
    1521   UInt uiScanIdx = pcCU->getCoefScanIdx(uiAbsPartIdx, uiWidth, eTType==TEXT_LUMA, pcCU->isIntra(uiAbsPartIdx));
    1522  
    1523 #if ADAPTIVE_QP_SELECTION
    1524   memset(piArlDstCoeff, 0, sizeof(Int) *  uiMaxNumCoeff);
    1525 #endif
    1526  
    1527   Double pdCostCoeff [ 32 * 32 ];
    1528   Double pdCostSig   [ 32 * 32 ];
    1529   Double pdCostCoeff0[ 32 * 32 ];
    1530   ::memset( pdCostCoeff, 0, sizeof(Double) *  uiMaxNumCoeff );
    1531   ::memset( pdCostSig,   0, sizeof(Double) *  uiMaxNumCoeff );
    1532   Int rateIncUp   [ 32 * 32 ];
    1533   Int rateIncDown [ 32 * 32 ];
    1534   Int sigRateDelta[ 32 * 32 ];
    1535   Int deltaU      [ 32 * 32 ];
    1536   ::memset( rateIncUp,    0, sizeof(Int) *  uiMaxNumCoeff );
    1537   ::memset( rateIncDown,  0, sizeof(Int) *  uiMaxNumCoeff );
    1538   ::memset( sigRateDelta, 0, sizeof(Int) *  uiMaxNumCoeff );
    1539   ::memset( deltaU,       0, sizeof(Int) *  uiMaxNumCoeff );
    1540  
    1541   const UInt * scanCG;
    1542   {
    1543     scanCG = g_auiSigLastScan[ uiScanIdx ][ uiLog2BlkSize > 3 ? uiLog2BlkSize-2-1 : 0  ];
    1544     if( uiLog2BlkSize == 3 )
    1545     {
    1546       scanCG = g_sigLastScan8x8[ uiScanIdx ];
    1547     }
    1548     else if( uiLog2BlkSize == 5 )
    1549     {
    1550       scanCG = g_sigLastScanCG32x32;
    1551     }
    1552   }
    1553   const UInt uiCGSize = (1 << MLS_CG_SIZE);         // 16
     2209
     2210  TUEntropyCodingParameters codingParameters;
     2211  getTUEntropyCodingParameters(codingParameters, rTu, compID);
     2212  const UInt uiCGSize = (1 << MLS_CG_SIZE);
     2213
    15542214  Double pdCostCoeffGroupSig[ MLS_GRP_NUM ];
    15552215  UInt uiSigCoeffGroupFlag[ MLS_GRP_NUM ];
    1556   UInt uiNumBlkSide = uiWidth / MLS_CG_SIZE;
    15572216  Int iCGLastScanPos = -1;
    1558  
     2217
    15592218  UInt    uiCtxSet            = 0;
    15602219  Int     c1                  = 1;
     
    15622221  Double  d64BaseCost         = 0;
    15632222  Int     iLastScanPos        = -1;
    1564  
     2223
    15652224  UInt    c1Idx     = 0;
    15662225  UInt    c2Idx     = 0;
    15672226  Int     baseLevel;
    1568  
    1569   const UInt *scan = g_auiSigLastScan[ uiScanIdx ][ uiLog2BlkSize - 1 ];
    1570  
    1571   ::memset( pdCostCoeffGroupSig,   0, sizeof(Double) * MLS_GRP_NUM );
    1572   ::memset( uiSigCoeffGroupFlag,   0, sizeof(UInt) * MLS_GRP_NUM );
    1573  
     2227
     2228  memset( pdCostCoeffGroupSig,   0, sizeof(Double) * MLS_GRP_NUM );
     2229  memset( uiSigCoeffGroupFlag,   0, sizeof(UInt) * MLS_GRP_NUM );
     2230
    15742231  UInt uiCGNum = uiWidth * uiHeight >> MLS_CG_SIZE;
    15752232  Int iScanPos;
    1576   coeffGroupRDStats rdStats;     
    1577  
     2233  coeffGroupRDStats rdStats;
     2234
     2235  const UInt significanceMapContextOffset = getSignificanceMapContextOffset(compID);
     2236
    15782237  for (Int iCGScanPos = uiCGNum-1; iCGScanPos >= 0; iCGScanPos--)
    15792238  {
    1580     UInt uiCGBlkPos = scanCG[ iCGScanPos ];
    1581     UInt uiCGPosY   = uiCGBlkPos / uiNumBlkSide;
    1582     UInt uiCGPosX   = uiCGBlkPos - (uiCGPosY * uiNumBlkSide);
    1583     ::memset( &rdStats, 0, sizeof (coeffGroupRDStats));
    1584    
    1585     const Int patternSigCtx = TComTrQuant::calcPatternSigCtx(uiSigCoeffGroupFlag, uiCGPosX, uiCGPosY, uiWidth, uiHeight);
     2239    UInt uiCGBlkPos = codingParameters.scanCG[ iCGScanPos ];
     2240    UInt uiCGPosY   = uiCGBlkPos / codingParameters.widthInGroups;
     2241    UInt uiCGPosX   = uiCGBlkPos - (uiCGPosY * codingParameters.widthInGroups);
     2242
     2243    memset( &rdStats, 0, sizeof (coeffGroupRDStats));
     2244
     2245    const Int patternSigCtx = TComTrQuant::calcPatternSigCtx(uiSigCoeffGroupFlag, uiCGPosX, uiCGPosY, codingParameters.widthInGroups, codingParameters.heightInGroups);
     2246
    15862247    for (Int iScanPosinCG = uiCGSize-1; iScanPosinCG >= 0; iScanPosinCG--)
    15872248    {
    15882249      iScanPos = iCGScanPos*uiCGSize + iScanPosinCG;
    15892250      //===== quantization =====
    1590       UInt    uiBlkPos          = scan[iScanPos];
     2251      UInt    uiBlkPos          = codingParameters.scan[iScanPos];
    15912252      // set coeff
    1592       Int uiQ  = piQCoef[uiBlkPos];
    1593       Double dTemp = pdErrScale[uiBlkPos];
    1594       Int lLevelDouble          = plSrcCoeff[ uiBlkPos ];
    1595       lLevelDouble              = (Int)min<Int64>((Int64)abs((Int)lLevelDouble) * uiQ , MAX_INT - (1 << (iQBits - 1)));
     2253
     2254      const Int    quantisationCoefficient = (enableScalingLists) ? piQCoef   [uiBlkPos] : defaultQuantisationCoefficient;
     2255      const Double errorScale              = (enableScalingLists) ? pdErrScale[uiBlkPos] : defaultErrorScale;
     2256
     2257      const Int64  tmpLevel                = Int64(abs(plSrcCoeff[ uiBlkPos ])) * quantisationCoefficient;
     2258
     2259      const Intermediate_Int lLevelDouble  = (Intermediate_Int)min<Int64>(tmpLevel, std::numeric_limits<Intermediate_Int>::max() - (Intermediate_Int(1) << (iQBits - 1)));
     2260
    15962261#if ADAPTIVE_QP_SELECTION
    15972262      if( m_bUseAdaptQpSelect )
    15982263      {
    1599         piArlDstCoeff[uiBlkPos]   = (Int)(( lLevelDouble + iAddC) >> iQBitsC );
    1600       }
    1601 #endif
    1602       UInt uiMaxAbsLevel        = (lLevelDouble + (1 << (iQBits - 1))) >> iQBits;
    1603      
    1604       Double dErr               = Double( lLevelDouble );
    1605       pdCostCoeff0[ iScanPos ]  = dErr * dErr * dTemp;
     2264        piArlDstCoeff[uiBlkPos]   = (TCoeff)(( lLevelDouble + iAddC) >> iQBitsC );
     2265      }
     2266#endif
     2267      const UInt uiMaxAbsLevel  = std::min<UInt>(UInt(entropyCodingMaximum), UInt((lLevelDouble + (Intermediate_Int(1) << (iQBits - 1))) >> iQBits));
     2268
     2269      const Double dErr         = Double( lLevelDouble );
     2270      pdCostCoeff0[ iScanPos ]  = dErr * dErr * errorScale;
    16062271      d64BlockUncodedCost      += pdCostCoeff0[ iScanPos ];
    16072272      piDstCoeff[ uiBlkPos ]    = uiMaxAbsLevel;
    1608      
     2273
    16092274      if ( uiMaxAbsLevel > 0 && iLastScanPos < 0 )
    16102275      {
    16112276        iLastScanPos            = iScanPos;
    1612         uiCtxSet                = (iScanPos < SCAN_SET_SIZE || eTType!=TEXT_LUMA) ? 0 : 2;
     2277        uiCtxSet                = getContextSetIndex(compID, (iScanPos >> MLS_CG_SIZE), 0);
    16132278        iCGLastScanPos          = iCGScanPos;
    16142279      }
    1615      
     2280
    16162281      if ( iLastScanPos >= 0 )
    16172282      {
    16182283        //===== coefficient level estimation =====
    16192284        UInt  uiLevel;
    1620         UInt  uiOneCtx         = 4 * uiCtxSet + c1;
    1621         UInt  uiAbsCtx         = uiCtxSet + c2;
    1622        
     2285        UInt  uiOneCtx         = (NUM_ONE_FLAG_CTX_PER_SET * uiCtxSet) + c1;
     2286        UInt  uiAbsCtx         = (NUM_ABS_FLAG_CTX_PER_SET * uiCtxSet) + c2;
     2287
    16232288        if( iScanPos == iLastScanPos )
    16242289        {
    1625           uiLevel              = xGetCodedLevel( pdCostCoeff[ iScanPos ], pdCostCoeff0[ iScanPos ], pdCostSig[ iScanPos ],
    1626                                                 lLevelDouble, uiMaxAbsLevel, 0, uiOneCtx, uiAbsCtx, uiGoRiceParam,
    1627                                                 c1Idx, c2Idx, iQBits, dTemp, 1 );
     2290          uiLevel              = xGetCodedLevel( pdCostCoeff[ iScanPos ], pdCostCoeff0[ iScanPos ], pdCostSig[ iScanPos ],
     2291                                                  lLevelDouble, uiMaxAbsLevel, significanceMapContextOffset, uiOneCtx, uiAbsCtx, uiGoRiceParam,
     2292                                                  c1Idx, c2Idx, iQBits, errorScale, 1, extendedPrecision, maxLog2TrDynamicRange
     2293                                                  );
    16282294        }
    16292295        else
    16302296        {
    1631           UInt   uiPosY        = uiBlkPos >> uiLog2BlkSize;
    1632           UInt   uiPosX        = uiBlkPos - ( uiPosY << uiLog2BlkSize );
    1633           UShort uiCtxSig      = getSigCtxInc( patternSigCtx, uiScanIdx, uiPosX, uiPosY, uiLog2BlkSize, eTType );
     2297          UShort uiCtxSig      = significanceMapContextOffset + getSigCtxInc( patternSigCtx, codingParameters, iScanPos, uiLog2BlockWidth, uiLog2BlockHeight, channelType );
     2298
    16342299          uiLevel              = xGetCodedLevel( pdCostCoeff[ iScanPos ], pdCostCoeff0[ iScanPos ], pdCostSig[ iScanPos ],
    1635                                                 lLevelDouble, uiMaxAbsLevel, uiCtxSig, uiOneCtx, uiAbsCtx, uiGoRiceParam,
    1636                                                 c1Idx, c2Idx, iQBits, dTemp, 0 );
     2300                                                  lLevelDouble, uiMaxAbsLevel, uiCtxSig, uiOneCtx, uiAbsCtx, uiGoRiceParam,
     2301                                                  c1Idx, c2Idx, iQBits, errorScale, 0, extendedPrecision, maxLog2TrDynamicRange
     2302                                                  );
     2303
    16372304          sigRateDelta[ uiBlkPos ] = m_pcEstBitsSbac->significantBits[ uiCtxSig ][ 1 ] - m_pcEstBitsSbac->significantBits[ uiCtxSig ][ 0 ];
    16382305        }
    1639         deltaU[ uiBlkPos ]        = (lLevelDouble - ((Int)uiLevel << iQBits)) >> (iQBits-8);
     2306
     2307        deltaU[ uiBlkPos ]        = TCoeff((lLevelDouble - (Intermediate_Int(uiLevel) << iQBits)) >> (iQBits-8));
     2308
    16402309        if( uiLevel > 0 )
    16412310        {
    1642           Int rateNow = xGetICRate( uiLevel, uiOneCtx, uiAbsCtx, uiGoRiceParam, c1Idx, c2Idx );
    1643           rateIncUp   [ uiBlkPos ] = xGetICRate( uiLevel+1, uiOneCtx, uiAbsCtx, uiGoRiceParam, c1Idx, c2Idx ) - rateNow;
    1644           rateIncDown [ uiBlkPos ] = xGetICRate( uiLevel-1, uiOneCtx, uiAbsCtx, uiGoRiceParam, c1Idx, c2Idx ) - rateNow;
     2311          Int rateNow = xGetICRate( uiLevel, uiOneCtx, uiAbsCtx, uiGoRiceParam, c1Idx, c2Idx, extendedPrecision, maxLog2TrDynamicRange );
     2312          rateIncUp   [ uiBlkPos ] = xGetICRate( uiLevel+1, uiOneCtx, uiAbsCtx, uiGoRiceParam, c1Idx, c2Idx, extendedPrecision, maxLog2TrDynamicRange ) - rateNow;
     2313          rateIncDown [ uiBlkPos ] = xGetICRate( uiLevel-1, uiOneCtx, uiAbsCtx, uiGoRiceParam, c1Idx, c2Idx, extendedPrecision, maxLog2TrDynamicRange ) - rateNow;
    16452314        }
    16462315        else // uiLevel == 0
     
    16502319        piDstCoeff[ uiBlkPos ] = uiLevel;
    16512320        d64BaseCost           += pdCostCoeff [ iScanPos ];
    1652        
    1653        
     2321
    16542322        baseLevel = (c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx < C2FLAG_NUMBER)) : 1;
    16552323        if( uiLevel >= baseLevel )
    16562324        {
    1657           if(uiLevel > 3*(1<<uiGoRiceParam))
     2325          if (uiLevel > 3*(1<<uiGoRiceParam))
    16582326          {
    1659             uiGoRiceParam = min<UInt>(uiGoRiceParam+ 1, 4);
     2327            uiGoRiceParam = bUseGolombRiceParameterAdaptation ? (uiGoRiceParam + 1) : (std::min<UInt>((uiGoRiceParam + 1), 4));
    16602328          }
    16612329        }
     
    16642332          c1Idx ++;
    16652333        }
    1666        
     2334
    16672335        //===== update bin model =====
    16682336        if( uiLevel > 1 )
    16692337        {
    1670           c1 = 0; 
     2338          c1 = 0;
    16712339          c2 += (c2 < 2);
    16722340          c2Idx ++;
     
    16762344          c1++;
    16772345        }
    1678        
     2346
    16792347        //===== context set update =====
    1680         if( ( iScanPos % SCAN_SET_SIZE == 0 ) && ( iScanPos > 0 ) )
    1681         {
     2348        if( ( iScanPos % uiCGSize == 0 ) && ( iScanPos > 0 ) )
     2349        {
     2350          uiCtxSet          = getContextSetIndex(compID, ((iScanPos - 1) >> MLS_CG_SIZE), (c1 == 0)); //(iScanPos - 1) because we do this **before** entering the final group
     2351          c1                = 1;
    16822352          c2                = 0;
    1683           uiGoRiceParam     = 0;
    1684          
    1685           c1Idx   = 0;
    1686           c2Idx   = 0;
    1687           uiCtxSet          = (iScanPos == SCAN_SET_SIZE || eTType!=TEXT_LUMA) ? 0 : 2;
    1688           if( c1 == 0 )
    1689           {
    1690             uiCtxSet++;
    1691           }
    1692           c1 = 1;
     2353          c1Idx             = 0;
     2354          c2Idx             = 0;
     2355          uiGoRiceParam     = initialGolombRiceParameter;
    16932356        }
    16942357      }
     
    17132376      }
    17142377    } //end for (iScanPosinCG)
    1715    
    1716     if (iCGLastScanPos >= 0) 
     2378
     2379    if (iCGLastScanPos >= 0)
    17172380    {
    17182381      if( iCGScanPos )
     
    17202383        if (uiSigCoeffGroupFlag[ uiCGBlkPos ] == 0)
    17212384        {
    1722           UInt  uiCtxSig = getSigCoeffGroupCtxInc( uiSigCoeffGroupFlag, uiCGPosX, uiCGPosY, uiWidth, uiHeight);
    1723           d64BaseCost += xGetRateSigCoeffGroup(0, uiCtxSig) - rdStats.d64SigCost;; 
    1724           pdCostCoeffGroupSig[ iCGScanPos ] = xGetRateSigCoeffGroup(0, uiCtxSig); 
    1725         } 
     2385          UInt  uiCtxSig = getSigCoeffGroupCtxInc( uiSigCoeffGroupFlag, uiCGPosX, uiCGPosY, codingParameters.widthInGroups, codingParameters.heightInGroups );
     2386          d64BaseCost += xGetRateSigCoeffGroup(0, uiCtxSig) - rdStats.d64SigCost;;
     2387          pdCostCoeffGroupSig[ iCGScanPos ] = xGetRateSigCoeffGroup(0, uiCtxSig);
     2388        }
    17262389        else
    17272390        {
    17282391          if (iCGScanPos < iCGLastScanPos) //skip the last coefficient group, which will be handled together with last position below.
    17292392          {
    1730             if ( rdStats.iNNZbeforePos0 == 0 ) 
     2393            if ( rdStats.iNNZbeforePos0 == 0 )
    17312394            {
    17322395              d64BaseCost -= rdStats.d64SigCost_0;
     
    17352398            // rd-cost if SigCoeffGroupFlag = 0, initialization
    17362399            Double d64CostZeroCG = d64BaseCost;
    1737            
     2400
    17382401            // add SigCoeffGroupFlag cost to total cost
    1739             UInt  uiCtxSig = getSigCoeffGroupCtxInc( uiSigCoeffGroupFlag, uiCGPosX, uiCGPosY, uiWidth, uiHeight);
     2402            UInt  uiCtxSig = getSigCoeffGroupCtxInc( uiSigCoeffGroupFlag, uiCGPosX, uiCGPosY, codingParameters.widthInGroups, codingParameters.heightInGroups );
     2403
    17402404            if (iCGScanPos < iCGLastScanPos)
    17412405            {
    1742               d64BaseCost  += xGetRateSigCoeffGroup(1, uiCtxSig); 
    1743               d64CostZeroCG += xGetRateSigCoeffGroup(0, uiCtxSig); 
    1744               pdCostCoeffGroupSig[ iCGScanPos ] = xGetRateSigCoeffGroup(1, uiCtxSig); 
     2406              d64BaseCost  += xGetRateSigCoeffGroup(1, uiCtxSig);
     2407              d64CostZeroCG += xGetRateSigCoeffGroup(0, uiCtxSig);
     2408              pdCostCoeffGroupSig[ iCGScanPos ] = xGetRateSigCoeffGroup(1, uiCtxSig);
    17452409            }
    1746            
     2410
    17472411            // try to convert the current coeff group from non-zero to all-zero
    17482412            d64CostZeroCG += rdStats.d64UncodedDist;  // distortion for resetting non-zero levels to zero levels
    17492413            d64CostZeroCG -= rdStats.d64CodedLevelandDist;   // distortion and level cost for keeping all non-zero levels
    17502414            d64CostZeroCG -= rdStats.d64SigCost;     // sig cost for all coeffs, including zero levels and non-zero levels
    1751            
     2415
    17522416            // if we can save cost, change this block to all-zero block
    1753             if ( d64CostZeroCG < d64BaseCost )     
     2417            if ( d64CostZeroCG < d64BaseCost )
    17542418            {
    17552419              uiSigCoeffGroupFlag[ uiCGBlkPos ] = 0;
     
    17572421              if (iCGScanPos < iCGLastScanPos)
    17582422              {
    1759                 pdCostCoeffGroupSig[ iCGScanPos ] = xGetRateSigCoeffGroup(0, uiCtxSig); 
     2423                pdCostCoeffGroupSig[ iCGScanPos ] = xGetRateSigCoeffGroup(0, uiCtxSig);
    17602424              }
    1761               // reset coeffs to 0 in this block               
     2425              // reset coeffs to 0 in this block
    17622426              for (Int iScanPosinCG = uiCGSize-1; iScanPosinCG >= 0; iScanPosinCG--)
    17632427              {
    17642428                iScanPos      = iCGScanPos*uiCGSize + iScanPosinCG;
    1765                 UInt uiBlkPos = scan[ iScanPos ];
    1766                
     2429                UInt uiBlkPos = codingParameters.scan[ iScanPos ];
     2430
    17672431                if (piDstCoeff[ uiBlkPos ])
    17682432                {
     
    17722436                }
    17732437              }
    1774             } // end if ( d64CostAllZeros < d64BaseCost )     
     2438            } // end if ( d64CostAllZeros < d64BaseCost )
    17752439          }
    17762440        } // end if (uiSigCoeffGroupFlag[ uiCGBlkPos ] == 0)
     
    17822446    }
    17832447  } //end for (iCGScanPos)
    1784  
     2448
    17852449  //===== estimate last position =====
    17862450  if ( iLastScanPos < 0 )
     
    17882452    return;
    17892453  }
    1790  
     2454
    17912455  Double  d64BestCost         = 0;
    17922456  Int     ui16CtxCbf          = 0;
    17932457  Int     iBestLastIdxP1      = 0;
    1794   if( !pcCU->isIntra( uiAbsPartIdx ) && eTType == TEXT_LUMA && pcCU->getTransformIdx( uiAbsPartIdx ) == 0 )
     2458  if( !pcCU->isIntra( uiAbsPartIdx ) && isLuma(compID) && pcCU->getTransformIdx( uiAbsPartIdx ) == 0 )
    17952459  {
    17962460    ui16CtxCbf   = 0;
     
    18002464  else
    18012465  {
    1802     ui16CtxCbf   = pcCU->getCtxQtCbf( eTType, pcCU->getTransformIdx( uiAbsPartIdx ) );
    1803     ui16CtxCbf   = ( eTType ? TEXT_CHROMA : eTType ) * NUM_QT_CBF_CTX + ui16CtxCbf;
     2466    ui16CtxCbf   = pcCU->getCtxQtCbf( rTu, channelType );
     2467    ui16CtxCbf  += getCBFContextOffset(compID);
    18042468    d64BestCost  = d64BlockUncodedCost + xGetICost( m_pcEstBitsSbac->blockCbpBits[ ui16CtxCbf ][ 0 ] );
    18052469    d64BaseCost += xGetICost( m_pcEstBitsSbac->blockCbpBits[ ui16CtxCbf ][ 1 ] );
    18062470  }
    1807  
     2471
     2472
    18082473  Bool bFoundLast = false;
    18092474  for (Int iCGScanPos = iCGLastScanPos; iCGScanPos >= 0; iCGScanPos--)
    18102475  {
    1811     UInt uiCGBlkPos = scanCG[ iCGScanPos ];
    1812    
    1813     d64BaseCost -= pdCostCoeffGroupSig [ iCGScanPos ]; 
     2476    UInt uiCGBlkPos = codingParameters.scanCG[ iCGScanPos ];
     2477
     2478    d64BaseCost -= pdCostCoeffGroupSig [ iCGScanPos ];
    18142479    if (uiSigCoeffGroupFlag[ uiCGBlkPos ])
    1815     {     
     2480    {
    18162481      for (Int iScanPosinCG = uiCGSize-1; iScanPosinCG >= 0; iScanPosinCG--)
    18172482      {
    18182483        iScanPos = iCGScanPos*uiCGSize + iScanPosinCG;
    1819         if (iScanPos > iLastScanPos) continue;
    1820         UInt   uiBlkPos     = scan[iScanPos];
    1821        
     2484
     2485        if (iScanPos > iLastScanPos)
     2486        {
     2487          continue;
     2488        }
     2489        UInt   uiBlkPos     = codingParameters.scan[iScanPos];
     2490
    18222491        if( piDstCoeff[ uiBlkPos ] )
    18232492        {
    1824           UInt   uiPosY       = uiBlkPos >> uiLog2BlkSize;
    1825           UInt   uiPosX       = uiBlkPos - ( uiPosY << uiLog2BlkSize );
    1826          
    1827           Double d64CostLast= uiScanIdx == SCAN_VER ? xGetRateLast( uiPosY, uiPosX ) : xGetRateLast( uiPosX, uiPosY );
     2493          UInt   uiPosY       = uiBlkPos >> uiLog2BlockWidth;
     2494          UInt   uiPosX       = uiBlkPos - ( uiPosY << uiLog2BlockWidth );
     2495
     2496          Double d64CostLast= codingParameters.scanType == SCAN_VER ? xGetRateLast( uiPosY, uiPosX, compID ) : xGetRateLast( uiPosX, uiPosY, compID );
    18282497          Double totalCost = d64BaseCost + d64CostLast - pdCostSig[ iScanPos ];
    1829          
     2498
    18302499          if( totalCost < d64BestCost )
    18312500          {
     
    18452514          d64BaseCost      -= pdCostSig[ iScanPos ];
    18462515        }
    1847       } //end for 
     2516      } //end for
    18482517      if (bFoundLast)
    18492518      {
     
    18512520      }
    18522521    } // end if (uiSigCoeffGroupFlag[ uiCGBlkPos ])
    1853   } // end for
    1854  
     2522  } // end for
     2523
     2524
    18552525  for ( Int scanPos = 0; scanPos < iBestLastIdxP1; scanPos++ )
    18562526  {
    1857     Int blkPos = scan[ scanPos ];
    1858     Int level = piDstCoeff[ blkPos ];
     2527    Int blkPos = codingParameters.scan[ scanPos ];
     2528    TCoeff level = piDstCoeff[ blkPos ];
    18592529    uiAbsSum += level;
    18602530    piDstCoeff[ blkPos ] = ( plSrcCoeff[ blkPos ] < 0 ) ? -level : level;
    18612531  }
    1862  
     2532
    18632533  //===== clean uncoded coefficients =====
    18642534  for ( Int scanPos = iBestLastIdxP1; scanPos <= iLastScanPos; scanPos++ )
    18652535  {
    1866     piDstCoeff[ scan[ scanPos ] ] = 0;
    1867   }
    1868  
     2536    piDstCoeff[ codingParameters.scan[ scanPos ] ] = 0;
     2537  }
     2538
     2539
    18692540  if( pcCU->getSlice()->getPPS()->getSignHideFlag() && uiAbsSum>=2)
    18702541  {
    1871     Int64 rdFactor = (Int64) (
    1872                      g_invQuantScales[m_cQP.rem()] * g_invQuantScales[m_cQP.rem()] * (1<<(2*m_cQP.m_iPer))
    1873                    / m_dLambda / 16 / (1<<DISTORTION_PRECISION_ADJUSTMENT(2*(uiBitDepth-8)))
    1874                    + 0.5);
     2542    const Double inverseQuantScale = Double(g_invQuantScales[cQP.rem]);
     2543    Int64 rdFactor = (Int64)(inverseQuantScale * inverseQuantScale * (1 << (2 * cQP.per))
     2544                             / m_dLambda / 16 / (1 << (2 * DISTORTION_PRECISION_ADJUSTMENT(channelBitDepth - 8)))
     2545                             + 0.5);
     2546
    18752547    Int lastCG = -1;
    18762548    Int absSum = 0 ;
    18772549    Int n ;
    1878    
    1879     for( Int subSet = (uiWidth*uiHeight-1) >> LOG2_SCAN_SET_SIZE; subSet >= 0; subSet-- )
    1880     {
    1881       Int  subPos     = subSet << LOG2_SCAN_SET_SIZE;
    1882       Int  firstNZPosInCG=SCAN_SET_SIZE , lastNZPosInCG=-1 ;
     2550
     2551    for( Int subSet = (uiWidth*uiHeight-1) >> MLS_CG_SIZE; subSet >= 0; subSet-- )
     2552    {
     2553      Int  subPos     = subSet << MLS_CG_SIZE;
     2554      Int  firstNZPosInCG=uiCGSize , lastNZPosInCG=-1 ;
    18832555      absSum = 0 ;
    1884      
    1885       for(n = SCAN_SET_SIZE-1; n >= 0; --n )
    1886       {
    1887         if( piDstCoeff[ scan[ n + subPos ]] )
     2556
     2557      for(n = uiCGSize-1; n >= 0; --n )
     2558      {
     2559        if( piDstCoeff[ codingParameters.scan[ n + subPos ]] )
    18882560        {
    18892561          lastNZPosInCG = n;
     
    18912563        }
    18922564      }
    1893      
    1894       for(n = 0; n <SCAN_SET_SIZE; n++ )
    1895       {
    1896         if( piDstCoeff[ scan[ n + subPos ]] )
     2565
     2566      for(n = 0; n <uiCGSize; n++ )
     2567      {
     2568        if( piDstCoeff[ codingParameters.scan[ n + subPos ]] )
    18972569        {
    18982570          firstNZPosInCG = n;
     
    19002572        }
    19012573      }
    1902      
     2574
    19032575      for(n = firstNZPosInCG; n <=lastNZPosInCG; n++ )
    19042576      {
    1905         absSum += piDstCoeff[ scan[ n + subPos ]];
    1906       }
    1907      
     2577        absSum += Int(piDstCoeff[ codingParameters.scan[ n + subPos ]]);
     2578      }
     2579
    19082580      if(lastNZPosInCG>=0 && lastCG==-1)
    19092581      {
    1910         lastCG = 1; 
    1911       } 
    1912      
     2582        lastCG = 1;
     2583      }
     2584
    19132585      if( lastNZPosInCG-firstNZPosInCG>=SBH_THRESHOLD )
    19142586      {
    1915         UInt signbit = (piDstCoeff[scan[subPos+firstNZPosInCG]]>0?0:1);
     2587        UInt signbit = (piDstCoeff[codingParameters.scan[subPos+firstNZPosInCG]]>0?0:1);
    19162588        if( signbit!=(absSum&0x1) )  // hide but need tune
    19172589        {
    1918           // calculate the cost 
    1919           Int64 minCostInc = MAX_INT64, curCost=MAX_INT64;
    1920           Int minPos =-1, finalChange=0, curChange=0;
    1921          
    1922           for( n = (lastCG==1?lastNZPosInCG:SCAN_SET_SIZE-1) ; n >= 0; --n )
     2590          // calculate the cost
     2591          Int64 minCostInc = std::numeric_limits<Int64>::max(), curCost = std::numeric_limits<Int64>::max();
     2592          Int minPos = -1, finalChange = 0, curChange = 0;
     2593
     2594          for( n = (lastCG==1?lastNZPosInCG:uiCGSize-1) ; n >= 0; --n )
    19232595          {
    1924             UInt uiBlkPos   = scan[ n + subPos ];
     2596            UInt uiBlkPos   = codingParameters.scan[ n + subPos ];
    19252597            if(piDstCoeff[ uiBlkPos ] != 0 )
    19262598            {
    1927               Int64 costUp   = rdFactor * ( - deltaU[uiBlkPos] ) + rateIncUp[uiBlkPos] ;
    1928               Int64 costDown = rdFactor * (   deltaU[uiBlkPos] ) + rateIncDown[uiBlkPos] 
    1929               -   ((abs(piDstCoeff[uiBlkPos]) == 1) ? sigRateDelta[uiBlkPos] : 0);
    1930              
     2599              Int64 costUp   = rdFactor * ( - deltaU[uiBlkPos] ) + rateIncUp[uiBlkPos];
     2600              Int64 costDown = rdFactor * (   deltaU[uiBlkPos] ) + rateIncDown[uiBlkPos]
     2601                               -   ((abs(piDstCoeff[uiBlkPos]) == 1) ? sigRateDelta[uiBlkPos] : 0);
     2602
    19312603              if(lastCG==1 && lastNZPosInCG==n && abs(piDstCoeff[uiBlkPos])==1)
    19322604              {
    1933                 costDown -= (4<<15) ;
     2605                costDown -= (4<<15);
    19342606              }
    1935              
     2607
    19362608              if(costUp<costDown)
    1937               { 
     2609              {
    19382610                curCost = costUp;
    1939                 curChange =  1 ;
     2611                curChange =  1;
    19402612              }
    1941               else               
     2613              else
    19422614              {
    1943                 curChange = -1 ;
     2615                curChange = -1;
    19442616                if(n==firstNZPosInCG && abs(piDstCoeff[uiBlkPos])==1)
    19452617                {
    1946                   curCost = MAX_INT64 ;
     2618                  curCost = std::numeric_limits<Int64>::max();
    19472619                }
    19482620                else
    19492621                {
    1950                   curCost = costDown ;
     2622                  curCost = costDown;
    19512623                }
    19522624              }
     
    19542626            else
    19552627            {
    1956               curCost = rdFactor * ( - (abs(deltaU[uiBlkPos])) ) + (1<<15) + rateIncUp[uiBlkPos] + sigRateDelta[uiBlkPos] ; 
     2628              curCost = rdFactor * ( - (abs(deltaU[uiBlkPos])) ) + (1<<15) + rateIncUp[uiBlkPos] + sigRateDelta[uiBlkPos] ;
    19572629              curChange = 1 ;
    1958              
     2630
    19592631              if(n<firstNZPosInCG)
    19602632              {
     
    19622634                if(thissignbit != signbit )
    19632635                {
    1964                   curCost = MAX_INT64;
     2636                  curCost = std::numeric_limits<Int64>::max();
    19652637                }
    19662638              }
    19672639            }
    1968            
     2640
    19692641            if( curCost<minCostInc)
    19702642            {
    1971               minCostInc = curCost ;
    1972               finalChange = curChange ;
    1973               minPos = uiBlkPos ;
     2643              minCostInc = curCost;
     2644              finalChange = curChange;
     2645              minPos = uiBlkPos;
    19742646            }
    19752647          }
    1976          
    1977           if(piDstCoeff[minPos] == 32767 || piDstCoeff[minPos] == -32768)
     2648
     2649          if(piDstCoeff[minPos] == entropyCodingMaximum || piDstCoeff[minPos] == entropyCodingMinimum)
    19782650          {
    19792651            finalChange = -1;
    19802652          }
    1981          
     2653
    19822654          if(plSrcCoeff[minPos]>=0)
    19832655          {
     
    19862658          else
    19872659          {
    1988             piDstCoeff[minPos] -= finalChange ; 
    1989           }         
    1990         }
    1991       }
    1992      
     2660            piDstCoeff[minPos] -= finalChange ;
     2661          }
     2662        }
     2663      }
     2664
    19932665      if(lastCG==1)
    19942666      {
    1995         lastCG=0 ; 
    1996       }
    1997     }
    1998   }
    1999 }
     2667        lastCG=0 ;
     2668      }
     2669    }
     2670  }
     2671}
     2672
    20002673
    20012674/** Pattern decision for context derivation process of significant_coeff_flag
    20022675 * \param sigCoeffGroupFlag pointer to prior coded significant coeff group
    2003  * \param posXCG column of current coefficient group
    2004  * \param posYCG row of current coefficient group
    2005  * \param width width of the block
    2006  * \param height height of the block
     2676 * \param uiCGPosX column of current coefficient group
     2677 * \param uiCGPosY row of current coefficient group
     2678 * \param widthInGroups width of the block, in coefficient groups
     2679 * \param heightInGroups height of the block, in coefficient groups
    20072680 * \returns pattern for current coefficient group
    20082681 */
    2009 Int  TComTrQuant::calcPatternSigCtx( const UInt* sigCoeffGroupFlag, UInt posXCG, UInt posYCG, Int width, Int height )
    2010 {
    2011   if( width == 4 && height == 4 ) return -1;
     2682Int  TComTrQuant::calcPatternSigCtx( const UInt* sigCoeffGroupFlag, UInt uiCGPosX, UInt uiCGPosY, UInt widthInGroups, UInt heightInGroups )
     2683{
     2684  if ((widthInGroups <= 1) && (heightInGroups <= 1))
     2685  {
     2686    return 0;
     2687  }
     2688
     2689  const Bool rightAvailable = uiCGPosX < (widthInGroups  - 1);
     2690  const Bool belowAvailable = uiCGPosY < (heightInGroups - 1);
    20122691
    20132692  UInt sigRight = 0;
    20142693  UInt sigLower = 0;
    20152694
    2016   width >>= 2;
    2017   height >>= 2;
    2018   if( posXCG < width - 1 )
    2019   {
    2020     sigRight = (sigCoeffGroupFlag[ posYCG * width + posXCG + 1 ] != 0);
    2021   }
    2022   if (posYCG < height - 1 )
    2023   {
    2024     sigLower = (sigCoeffGroupFlag[ (posYCG  + 1 ) * width + posXCG ] != 0);
    2025   }
    2026   return sigRight + (sigLower<<1);
    2027 }
     2695  if (rightAvailable)
     2696  {
     2697    sigRight = ((sigCoeffGroupFlag[ (uiCGPosY * widthInGroups) + uiCGPosX + 1 ] != 0) ? 1 : 0);
     2698  }
     2699  if (belowAvailable)
     2700  {
     2701    sigLower = ((sigCoeffGroupFlag[ (uiCGPosY + 1) * widthInGroups + uiCGPosX ] != 0) ? 1 : 0);
     2702  }
     2703
     2704  return sigRight + (sigLower << 1);
     2705}
     2706
    20282707
    20292708/** Context derivation process of coeff_abs_significant_flag
    20302709 * \param patternSigCtx pattern for current coefficient group
    2031  * \param posX column of current scan position
    2032  * \param posY row of current scan position
    2033  * \param log2BlockSize log2 value of block size (square block)
    2034  * \param width width of the block
    2035  * \param height height of the block
    2036  * \param textureType texture type (TEXT_LUMA...)
     2710 * \param codingParameters coding parameters for the TU (includes the scan)
     2711 * \param scanPosition current position in scan order
     2712 * \param log2BlockWidth log2 width of the block
     2713 * \param log2BlockHeight log2 height of the block
     2714 * \param chanType channel type (CHANNEL_TYPE_LUMA/CHROMA)
    20372715 * \returns ctxInc for current scan position
    20382716 */
    2039 Int TComTrQuant::getSigCtxInc    (
    2040                                    Int                             patternSigCtx,
    2041                                    UInt                            scanIdx,
    2042                                    Int                             posX,
    2043                                    Int                             posY,
    2044                                    Int                             log2BlockSize,
    2045                                    TextType                        textureType
    2046                                   )
    2047 {
    2048   const Int ctxIndMap[16] =
    2049   {
    2050     0, 1, 4, 5,
    2051     2, 3, 4, 5,
    2052     6, 6, 8, 8,
    2053     7, 7, 8, 8
    2054   };
    2055 
    2056   if( posX + posY == 0 )
    2057   {
    2058     return 0;
    2059   }
    2060 
    2061   if ( log2BlockSize == 2 )
    2062   {
    2063     return ctxIndMap[ 4 * posY + posX ];
    2064   }
    2065 
    2066   Int offset = log2BlockSize == 3 ? (scanIdx==SCAN_DIAG ? 9 : 15) : (textureType == TEXT_LUMA ? 21 : 12);
    2067 
    2068   Int posXinSubset = posX-((posX>>2)<<2);
    2069   Int posYinSubset = posY-((posY>>2)<<2);
    2070   Int cnt = 0;
    2071   if(patternSigCtx==0)
    2072   {
    2073     cnt = posXinSubset+posYinSubset<=2 ? (posXinSubset+posYinSubset==0 ? 2 : 1) : 0;
    2074   }
    2075   else if(patternSigCtx==1)
    2076   {
    2077     cnt = posYinSubset<=1 ? (posYinSubset==0 ? 2 : 1) : 0;
    2078   }
    2079   else if(patternSigCtx==2)
    2080   {
    2081     cnt = posXinSubset<=1 ? (posXinSubset==0 ? 2 : 1) : 0;
     2717Int TComTrQuant::getSigCtxInc    (       Int                        patternSigCtx,
     2718                                   const TUEntropyCodingParameters &codingParameters,
     2719                                   const Int                        scanPosition,
     2720                                   const Int                        log2BlockWidth,
     2721                                   const Int                        log2BlockHeight,
     2722                                   const ChannelType                chanType)
     2723{
     2724  if (codingParameters.firstSignificanceMapContext == significanceMapContextSetStart[chanType][CONTEXT_TYPE_SINGLE])
     2725  {
     2726    //single context mode
     2727    return significanceMapContextSetStart[chanType][CONTEXT_TYPE_SINGLE];
     2728  }
     2729
     2730  const UInt rasterPosition = codingParameters.scan[scanPosition];
     2731  const UInt posY           = rasterPosition >> log2BlockWidth;
     2732  const UInt posX           = rasterPosition - (posY << log2BlockWidth);
     2733
     2734  if ((posX + posY) == 0)
     2735  {
     2736    return 0; //special case for the DC context variable
     2737  }
     2738
     2739  Int offset = MAX_INT;
     2740
     2741  if ((log2BlockWidth == 2) && (log2BlockHeight == 2)) //4x4
     2742  {
     2743    offset = ctxIndMap4x4[ (4 * posY) + posX ];
    20822744  }
    20832745  else
    20842746  {
    2085     cnt = 2;
    2086   }
    2087 
    2088   return (( textureType == TEXT_LUMA && ((posX>>2) + (posY>>2)) > 0 ) ? 3 : 0) + offset + cnt;
    2089 }
     2747    Int cnt = 0;
     2748
     2749    switch (patternSigCtx)
     2750    {
     2751      //------------------
     2752
     2753      case 0: //neither neighbouring group is significant
     2754        {
     2755          const Int posXinSubset     = posX & ((1 << MLS_CG_LOG2_WIDTH)  - 1);
     2756          const Int posYinSubset     = posY & ((1 << MLS_CG_LOG2_HEIGHT) - 1);
     2757          const Int posTotalInSubset = posXinSubset + posYinSubset;
     2758
     2759          //first N coefficients in scan order use 2; the next few use 1; the rest use 0.
     2760          const UInt context1Threshold = NEIGHBOURHOOD_00_CONTEXT_1_THRESHOLD_4x4;
     2761          const UInt context2Threshold = NEIGHBOURHOOD_00_CONTEXT_2_THRESHOLD_4x4;
     2762
     2763          cnt = (posTotalInSubset >= context1Threshold) ? 0 : ((posTotalInSubset >= context2Threshold) ? 1 : 2);
     2764        }
     2765        break;
     2766
     2767      //------------------
     2768
     2769      case 1: //right group is significant, below is not
     2770        {
     2771          const Int posYinSubset = posY & ((1 << MLS_CG_LOG2_HEIGHT) - 1);
     2772          const Int groupHeight  = 1 << MLS_CG_LOG2_HEIGHT;
     2773
     2774          cnt = (posYinSubset >= (groupHeight >> 1)) ? 0 : ((posYinSubset >= (groupHeight >> 2)) ? 1 : 2); //top quarter uses 2; second-from-top quarter uses 1; bottom half uses 0
     2775        }
     2776        break;
     2777
     2778      //------------------
     2779
     2780      case 2: //below group is significant, right is not
     2781        {
     2782          const Int posXinSubset = posX & ((1 << MLS_CG_LOG2_WIDTH)  - 1);
     2783          const Int groupWidth   = 1 << MLS_CG_LOG2_WIDTH;
     2784
     2785          cnt = (posXinSubset >= (groupWidth >> 1)) ? 0 : ((posXinSubset >= (groupWidth >> 2)) ? 1 : 2); //left quarter uses 2; second-from-left quarter uses 1; right half uses 0
     2786        }
     2787        break;
     2788
     2789      //------------------
     2790
     2791      case 3: //both neighbouring groups are significant
     2792        {
     2793          cnt = 2;
     2794        }
     2795        break;
     2796
     2797      //------------------
     2798
     2799      default:
     2800        std::cerr << "ERROR: Invalid patternSigCtx \"" << Int(patternSigCtx) << "\" in getSigCtxInc" << std::endl;
     2801        exit(1);
     2802        break;
     2803    }
     2804
     2805    //------------------------------------------------
     2806
     2807    const Bool notFirstGroup = ((posX >> MLS_CG_LOG2_WIDTH) + (posY >> MLS_CG_LOG2_HEIGHT)) > 0;
     2808
     2809    offset = (notFirstGroup ? notFirstGroupNeighbourhoodContextOffset[chanType] : 0) + cnt;
     2810  }
     2811
     2812  return codingParameters.firstSignificanceMapContext + offset;
     2813}
     2814
    20902815
    20912816/** Get the best level in RD sense
    2092  * \param rd64CodedCost reference to coded cost
    2093  * \param rd64CodedCost0 reference to cost when coefficient is 0
    2094  * \param rd64CodedCostSig reference to cost of significant coefficient
    2095  * \param lLevelDouble reference to unscaled quantized level
    2096  * \param uiMaxAbsLevel scaled quantized level
    2097  * \param ui16CtxNumSig current ctxInc for coeff_abs_significant_flag
    2098  * \param ui16CtxNumOne current ctxInc for coeff_abs_level_greater1 (1st bin of coeff_abs_level_minus1 in AVC)
    2099  * \param ui16CtxNumAbs current ctxInc for coeff_abs_level_greater2 (remaining bins of coeff_abs_level_minus1 in AVC)
    2100  * \param ui16AbsGoRice current Rice parameter for coeff_abs_level_minus3
    2101  * \param iQBits quantization step size
    2102  * \param dTemp correction factor
    2103  * \param bLast indicates if the coefficient is the last significant
     2817 *
    21042818 * \returns best quantized transform level for given scan position
     2819 *
    21052820 * This method calculates the best quantized transform level for a given scan position.
    21062821 */
    2107 __inline UInt TComTrQuant::xGetCodedLevel ( Double&                         rd64CodedCost,
    2108                                             Double&                         rd64CodedCost0,
    2109                                             Double&                         rd64CodedCostSig,
    2110                                             Int                             lLevelDouble,
    2111                                             UInt                            uiMaxAbsLevel,
    2112                                             UShort                          ui16CtxNumSig,
    2113                                             UShort                          ui16CtxNumOne,
    2114                                             UShort                          ui16CtxNumAbs,
    2115                                             UShort                          ui16AbsGoRice,
    2116                                             UInt                            c1Idx,
    2117                                             UInt                            c2Idx,
    2118                                             Int                             iQBits,
    2119                                             Double                          dTemp,
    2120                                             Bool                            bLast        ) const
    2121 {
    2122   Double dCurrCostSig   = 0;
     2822__inline UInt TComTrQuant::xGetCodedLevel ( Double&          rd64CodedCost,          //< reference to coded cost
     2823                                            Double&          rd64CodedCost0,         //< reference to cost when coefficient is 0
     2824                                            Double&          rd64CodedCostSig,       //< rd64CodedCostSig reference to cost of significant coefficient
     2825                                            Intermediate_Int lLevelDouble,           //< reference to unscaled quantized level
     2826                                            UInt             uiMaxAbsLevel,          //< scaled quantized level
     2827                                            UShort           ui16CtxNumSig,          //< current ctxInc for coeff_abs_significant_flag
     2828                                            UShort           ui16CtxNumOne,          //< current ctxInc for coeff_abs_level_greater1 (1st bin of coeff_abs_level_minus1 in AVC)
     2829                                            UShort           ui16CtxNumAbs,          //< current ctxInc for coeff_abs_level_greater2 (remaining bins of coeff_abs_level_minus1 in AVC)
     2830                                            UShort           ui16AbsGoRice,          //< current Rice parameter for coeff_abs_level_minus3
     2831                                            UInt             c1Idx,                  //<
     2832                                            UInt             c2Idx,                  //<
     2833                                            Int              iQBits,                 //< quantization step size
     2834                                            Double           errorScale,             //<
     2835                                            Bool             bLast,                  //< indicates if the coefficient is the last significant
     2836                                            Bool             useLimitedPrefixLength, //<
     2837                                            const Int        maxLog2TrDynamicRange   //<
     2838                                            ) const
     2839{
     2840  Double dCurrCostSig   = 0;
    21232841  UInt   uiBestAbsLevel = 0;
    2124  
     2842
    21252843  if( !bLast && uiMaxAbsLevel < 3 )
    21262844  {
    2127     rd64CodedCostSig    = xGetRateSigCoef( 0, ui16CtxNumSig ); 
     2845    rd64CodedCostSig    = xGetRateSigCoef( 0, ui16CtxNumSig );
    21282846    rd64CodedCost       = rd64CodedCost0 + rd64CodedCostSig;
    21292847    if( uiMaxAbsLevel == 0 )
     
    21452863  for( Int uiAbsLevel  = uiMaxAbsLevel; uiAbsLevel >= uiMinAbsLevel ; uiAbsLevel-- )
    21462864  {
    2147     Double dErr         = Double( lLevelDouble  - ( uiAbsLevel << iQBits ) );
    2148     Double dCurrCost    = dErr * dErr * dTemp + xGetICost(xGetICRate( uiAbsLevel, ui16CtxNumOne, ui16CtxNumAbs, ui16AbsGoRice, c1Idx, c2Idx ));
     2865    Double dErr         = Double( lLevelDouble  - ( Intermediate_Int(uiAbsLevel) << iQBits ) );
     2866    Double dCurrCost    = dErr * dErr * errorScale + xGetICost( xGetICRate( uiAbsLevel, ui16CtxNumOne, ui16CtxNumAbs, ui16AbsGoRice, c1Idx, c2Idx, useLimitedPrefixLength, maxLog2TrDynamicRange ) );
    21492867    dCurrCost          += dCurrCostSig;
    21502868
     
    21652883 * \param ui16CtxNumAbs current ctxInc for coeff_abs_level_greater2 (remaining bins of coeff_abs_level_minus1 in AVC)
    21662884 * \param ui16AbsGoRice Rice parameter for coeff_abs_level_minus3
     2885 * \param c1Idx
     2886 * \param c2Idx
     2887 * \param useLimitedPrefixLength
     2888 * \param maxLog2TrDynamicRange
    21672889 * \returns cost of given absolute transform level
    21682890 */
    2169 __inline Int TComTrQuant::xGetICRate  ( UInt                            uiAbsLevel,
    2170                                                UShort                          ui16CtxNumOne,
    2171                                                UShort                          ui16CtxNumAbs,
    2172                                                UShort                          ui16AbsGoRice
    2173                                             ,  UInt                            c1Idx,
    2174                                                UInt                            c2Idx
     2891__inline Int TComTrQuant::xGetICRate         ( const UInt    uiAbsLevel,
     2892                                               const UShort  ui16CtxNumOne,
     2893                                               const UShort  ui16CtxNumAbs,
     2894                                               const UShort  ui16AbsGoRice,
     2895                                               const UInt    c1Idx,
     2896                                               const UInt    c2Idx,
     2897                                               const Bool    useLimitedPrefixLength,
     2898                                               const Int     maxLog2TrDynamicRange
    21752899                                               ) const
    21762900{
    2177   Int iRate = Int(xGetIEPRate());
    2178   UInt baseLevel  =  (c1Idx < C1FLAG_NUMBER)? (2 + (c2Idx < C2FLAG_NUMBER)) : 1;
     2901  Int  iRate      = Int(xGetIEPRate()); // cost of sign bit
     2902  UInt baseLevel  = (c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx < C2FLAG_NUMBER)) : 1;
    21792903
    21802904  if ( uiAbsLevel >= baseLevel )
    2181   {   
     2905  {
    21822906    UInt symbol     = uiAbsLevel - baseLevel;
    21832907    UInt length;
     
    21872911      iRate += (length+1+ui16AbsGoRice)<< 15;
    21882912    }
     2913    else if (useLimitedPrefixLength)
     2914    {
     2915      const UInt maximumPrefixLength = (32 - (COEF_REMAIN_BIN_REDUCTION + maxLog2TrDynamicRange));
     2916
     2917      UInt prefixLength = 0;
     2918      UInt suffix       = (symbol >> ui16AbsGoRice) - COEF_REMAIN_BIN_REDUCTION;
     2919
     2920      while ((prefixLength < maximumPrefixLength) && (suffix > ((2 << prefixLength) - 2)))
     2921      {
     2922        prefixLength++;
     2923      }
     2924
     2925      const UInt suffixLength = (prefixLength == maximumPrefixLength) ? (maxLog2TrDynamicRange - ui16AbsGoRice) : (prefixLength + 1/*separator*/);
     2926
     2927      iRate += (COEF_REMAIN_BIN_REDUCTION + prefixLength + suffixLength + ui16AbsGoRice) << 15;
     2928    }
    21892929    else
    21902930    {
     
    21932933      while (symbol >= (1<<length))
    21942934      {
    2195         symbol -=  (1<<(length++));   
     2935        symbol -=  (1<<(length++));
    21962936      }
    21972937      iRate += (COEF_REMAIN_BIN_REDUCTION+length+1-ui16AbsGoRice+length)<< 15;
    21982938    }
     2939
    21992940    if (c1Idx < C1FLAG_NUMBER)
    22002941    {
     
    22072948    }
    22082949  }
    2209   else
    2210   if( uiAbsLevel == 1 )
     2950  else if( uiAbsLevel == 1 )
    22112951  {
    22122952    iRate += m_pcEstBitsSbac->m_greaterOneBits[ ui16CtxNumOne ][ 0 ];
     
    22212961    iRate = 0;
    22222962  }
    2223   return iRate;
     2963
     2964  return  iRate;
    22242965}
    22252966
     
    22332974 * \param uiPosX X coordinate of the last significant coefficient
    22342975 * \param uiPosY Y coordinate of the last significant coefficient
     2976 * \param component colour component ID
    22352977 * \returns cost of last significant coefficient
    22362978 */
     
    22392981*/
    22402982__inline Double TComTrQuant::xGetRateLast   ( const UInt                      uiPosX,
    2241                                               const UInt                      uiPosY ) const
     2983                                              const UInt                      uiPosY,
     2984                                              const ComponentID               component  ) const
    22422985{
    22432986  UInt uiCtxX   = g_uiGroupIdx[uiPosX];
    22442987  UInt uiCtxY   = g_uiGroupIdx[uiPosY];
    2245   Double uiCost = m_pcEstBitsSbac->lastXBits[ uiCtxX ] + m_pcEstBitsSbac->lastYBits[ uiCtxY ];
     2988
     2989  Double uiCost = m_pcEstBitsSbac->lastXBits[toChannelType(component)][ uiCtxX ] + m_pcEstBitsSbac->lastYBits[toChannelType(component)][ uiCtxY ];
     2990
    22462991  if( uiCtxX > 3 )
    22472992  {
     
    22553000}
    22563001
    2257  /** Calculates the cost for specific absolute transform level
    2258  * \param uiAbsLevel scaled quantized level
    2259  * \param ui16CtxNumOne current ctxInc for coeff_abs_level_greater1 (1st bin of coeff_abs_level_minus1 in AVC)
    2260  * \param ui16CtxNumAbs current ctxInc for coeff_abs_level_greater2 (remaining bins of coeff_abs_level_minus1 in AVC)
    2261  * \param ui16CtxBase current global offset for coeff_abs_level_greater1 and coeff_abs_level_greater2
    2262  * \returns cost of given absolute transform level
    2263  */
    22643002__inline Double TComTrQuant::xGetRateSigCoef  ( UShort                          uiSignificance,
    22653003                                                UShort                          ui16CtxNumSig ) const
     
    22873025/** Context derivation process of significant_coeff_group_flag
    22883026 * \param uiSigCoeffGroupFlag significance map of L1
    2289  * \param uiBlkX column of current scan position
    2290  * \param uiBlkY row of current scan position
    2291  * \param uiLog2BlkSize log2 value of block size
     3027 * \param uiCGPosX column of current coefficient group
     3028 * \param uiCGPosY row of current coefficient group
     3029 * \param widthInGroups width of the block, in coefficient groups
     3030 * \param heightInGroups height of the block, in coefficient groups
    22923031 * \returns ctxInc for current scan position
    22933032 */
    2294 UInt TComTrQuant::getSigCoeffGroupCtxInc  ( const UInt*               uiSigCoeffGroupFlag,
    2295                                            const UInt                      uiCGPosX,
    2296                                            const UInt                      uiCGPosY,
    2297                                            Int width, Int height)
    2298 {
    2299   UInt uiRight = 0;
    2300   UInt uiLower = 0;
    2301 
    2302   width >>= 2;
    2303   height >>= 2;
    2304   if( uiCGPosX < width - 1 )
    2305   {
    2306     uiRight = (uiSigCoeffGroupFlag[ uiCGPosY * width + uiCGPosX + 1 ] != 0);
    2307   }
    2308   if (uiCGPosY < height - 1 )
    2309   {
    2310     uiLower = (uiSigCoeffGroupFlag[ (uiCGPosY  + 1 ) * width + uiCGPosX ] != 0);
    2311   }
    2312   return (uiRight || uiLower);
    2313 
    2314 }
     3033UInt TComTrQuant::getSigCoeffGroupCtxInc  (const UInt*  uiSigCoeffGroupFlag,
     3034                                           const UInt   uiCGPosX,
     3035                                           const UInt   uiCGPosY,
     3036                                           const UInt   widthInGroups,
     3037                                           const UInt   heightInGroups)
     3038{
     3039  UInt sigRight = 0;
     3040  UInt sigLower = 0;
     3041
     3042  if (uiCGPosX < (widthInGroups  - 1))
     3043  {
     3044    sigRight = ((uiSigCoeffGroupFlag[ (uiCGPosY * widthInGroups) + uiCGPosX + 1 ] != 0) ? 1 : 0);
     3045  }
     3046  if (uiCGPosY < (heightInGroups - 1))
     3047  {
     3048    sigLower = ((uiSigCoeffGroupFlag[ (uiCGPosY + 1) * widthInGroups + uiCGPosX ] != 0) ? 1 : 0);
     3049  }
     3050
     3051  return ((sigRight + sigLower) != 0) ? 1 : 0;
     3052}
     3053
     3054
    23153055/** set quantized matrix coefficient for encode
    2316  * \param scalingList quantaized matrix address
     3056 * \param scalingList            quantized matrix address
     3057 * \param format                 chroma format
     3058 * \param maxLog2TrDynamicRange
     3059 * \param bitDepths              reference to bit depth array for all channels
    23173060 */
    2318 Void TComTrQuant::setScalingList(TComScalingList *scalingList)
    2319 {
    2320   UInt size,list;
    2321   UInt qp;
    2322 
    2323   for(size=0;size<SCALING_LIST_SIZE_NUM;size++)
    2324   {
    2325     for(list = 0; list < g_scalingListNum[size]; list++)
    2326     {
    2327       for(qp=0;qp<SCALING_LIST_REM_NUM;qp++)
     3061Void TComTrQuant::setScalingList(TComScalingList *scalingList, const Int maxLog2TrDynamicRange[MAX_NUM_CHANNEL_TYPE], const BitDepths &bitDepths)
     3062{
     3063  const Int minimumQp = 0;
     3064  const Int maximumQp = SCALING_LIST_REM_NUM;
     3065
     3066  for(UInt size = 0; size < SCALING_LIST_SIZE_NUM; size++)
     3067  {
     3068    for(UInt list = 0; list < SCALING_LIST_NUM; list++)
     3069    {
     3070      for(Int qp = minimumQp; qp < maximumQp; qp++)
    23283071      {
    23293072        xSetScalingListEnc(scalingList,list,size,qp);
     3073        xSetScalingListDec(*scalingList,list,size,qp);
     3074        setErrScaleCoeff(list,size,qp,maxLog2TrDynamicRange, bitDepths);
     3075      }
     3076    }
     3077  }
     3078}
     3079/** set quantized matrix coefficient for decode
     3080 * \param scalingList quantized matrix address
     3081 * \param format      chroma format
     3082 */
     3083Void TComTrQuant::setScalingListDec(const TComScalingList &scalingList)
     3084{
     3085  const Int minimumQp = 0;
     3086  const Int maximumQp = SCALING_LIST_REM_NUM;
     3087
     3088  for(UInt size = 0; size < SCALING_LIST_SIZE_NUM; size++)
     3089  {
     3090    for(UInt list = 0; list < SCALING_LIST_NUM; list++)
     3091    {
     3092      for(Int qp = minimumQp; qp < maximumQp; qp++)
     3093      {
    23303094        xSetScalingListDec(scalingList,list,size,qp);
    2331         setErrScaleCoeff(list,size,qp);
    2332       }
    2333     }
    2334   }
    2335 }
    2336 /** set quantized matrix coefficient for decode
    2337  * \param scalingList quantaized matrix address
     3095      }
     3096    }
     3097  }
     3098}
     3099/** set error scale coefficients
     3100 * \param list                   list ID
     3101 * \param size                   scaling list size index
     3102 * \param qp                     quantization parameter
     3103 * \param maxLog2TrDynamicRange  maximum log2 transform dynamic range, indexed by channel type
     3104 * \param bitDepths              reference to bit depth array for all channels
    23383105 */
    2339 Void TComTrQuant::setScalingListDec(TComScalingList *scalingList)
    2340 {
    2341   UInt size,list;
    2342   UInt qp;
    2343 
    2344   for(size=0;size<SCALING_LIST_SIZE_NUM;size++)
    2345   {
    2346     for(list = 0; list < g_scalingListNum[size]; list++)
    2347     {
    2348       for(qp=0;qp<SCALING_LIST_REM_NUM;qp++)
    2349       {
    2350         xSetScalingListDec(scalingList,list,size,qp);
    2351       }
    2352     }
    2353   }
    2354 }
    2355 /** set error scale coefficients
    2356  * \param list List ID
    2357  * \param uiSize Size
    2358  * \param uiQP Quantization parameter
    2359  */
    2360 Void TComTrQuant::setErrScaleCoeff(UInt list,UInt size, UInt qp)
    2361 {
    2362 
    2363   UInt uiLog2TrSize = g_aucConvertToBit[ g_scalingListSizeX[size] ] + 2;
    2364   Int bitDepth = (size < SCALING_LIST_32x32 && list != 0 && list != 3) ? g_bitDepthC : g_bitDepthY;
    2365   Int iTransformShift = MAX_TR_DYNAMIC_RANGE - bitDepth - uiLog2TrSize;  // Represents scaling through forward transform
     3106Void TComTrQuant::setErrScaleCoeff(UInt list, UInt size, Int qp, const Int maxLog2TrDynamicRange[MAX_NUM_CHANNEL_TYPE], const BitDepths &bitDepths)
     3107{
     3108  const UInt uiLog2TrSize = g_aucConvertToBit[ g_scalingListSizeX[size] ] + 2;
     3109  const ChannelType channelType = ((list == 0) || (list == MAX_NUM_COMPONENT)) ? CHANNEL_TYPE_LUMA : CHANNEL_TYPE_CHROMA;
     3110
     3111  const Int channelBitDepth    = bitDepths.recon[channelType];
     3112  const Int iTransformShift = getTransformShift(channelBitDepth, uiLog2TrSize, maxLog2TrDynamicRange[channelType]);  // Represents scaling through forward transform
    23663113
    23673114  UInt i,uiMaxNumCoeff = g_scalingListSize[size];
     
    23713118  pdErrScale     = getErrScaleCoeff(list, size, qp);
    23723119
    2373   Double dErrScale = (Double)(1<<SCALE_BITS);                              // Compensate for scaling of bitcount in Lagrange cost function
    2374   dErrScale = dErrScale*pow(2.0,-2.0*iTransformShift);                     // Compensate for scaling through forward transform
     3120  Double dErrScale = (Double)(1<<SCALE_BITS);                                // Compensate for scaling of bitcount in Lagrange cost function
     3121  dErrScale = dErrScale*pow(2.0,(-2.0*iTransformShift));                     // Compensate for scaling through forward transform
     3122
    23753123  for(i=0;i<uiMaxNumCoeff;i++)
    23763124  {
    2377     pdErrScale[i] = dErrScale / piQuantcoeff[i] / piQuantcoeff[i] / (1<<DISTORTION_PRECISION_ADJUSTMENT(2*(bitDepth-8)));
    2378   }
     3125    pdErrScale[i] =  dErrScale / piQuantcoeff[i] / piQuantcoeff[i] / (1 << DISTORTION_PRECISION_ADJUSTMENT(2 * (bitDepths.recon[channelType] - 8)));
     3126  }
     3127
     3128  getErrScaleCoeffNoScalingList(list, size, qp) = dErrScale / g_quantScales[qp] / g_quantScales[qp] / (1 << DISTORTION_PRECISION_ADJUSTMENT(2 * (bitDepths.recon[channelType] - 8)));
    23793129}
    23803130
    23813131/** set quantized matrix coefficient for encode
     3132 * \param scalingList quantized matrix address
     3133 * \param listId List index
     3134 * \param sizeId size index
     3135 * \param qp Quantization parameter (index into g_quantScales; also selects
     3136 *           the quantization table returned by getQuantCoeff)
     3137 */
     3138Void TComTrQuant::xSetScalingListEnc(TComScalingList *scalingList, UInt listId, UInt sizeId, Int qp)
     3139{
              // Width and height are both taken from g_scalingListSizeX[sizeId].
     3140  UInt width  = g_scalingListSizeX[sizeId];
     3141  UInt height = g_scalingListSizeX[sizeId];
              // Block size divided by the stored matrix size (capped at MAX_MATRIX_SIZE_NUM);
              // passed on to processScalingListEnc together with that capped size.
     3142  UInt ratio  = g_scalingListSizeX[sizeId]/min(MAX_MATRIX_SIZE_NUM,(Int)g_scalingListSizeX[sizeId]);
     3143  Int *quantcoeff;
     3144  Int *coeff  = scalingList->getScalingListAddress(sizeId,listId);
     3145  quantcoeff  = getQuantCoeff(listId, qp, sizeId);
     3146
     3147  Int quantScales = g_quantScales[qp];
     3148
              // The quantization scale is pre-shifted left by LOG2_SCALING_LIST_NEUTRAL_VALUE
              // before being handed to processScalingListEnc.
     3149  processScalingListEnc(coeff,
     3150                        quantcoeff,
     3151                        (quantScales << LOG2_SCALING_LIST_NEUTRAL_VALUE),
     3152                        height, width, ratio,
     3153                        min(MAX_MATRIX_SIZE_NUM, (Int)g_scalingListSizeX[sizeId]),
     3154                        scalingList->getScalingListDC(sizeId,listId));
     3155}
     3156
     3157/** set quantized matrix coefficient for decode
    23823158 * \param scalingList quantized matrix address
    23833159 * \param listId List index
    23843160 * \param sizeId size index
    2385  * \param uiQP Quantization parameter
     3161 * \param qp Quantization parameter (index into g_invQuantScales; also
     3162 *           selects the dequantization table returned by getDequantCoeff)
    23863163 */
    2387 Void TComTrQuant::xSetScalingListEnc(TComScalingList *scalingList, UInt listId, UInt sizeId, UInt qp)
    2388 {
    2389   UInt width = g_scalingListSizeX[sizeId];
     3164Void TComTrQuant::xSetScalingListDec(const TComScalingList &scalingList, UInt listId, UInt sizeId, Int qp)
     3165{
     3166  UInt width  = g_scalingListSizeX[sizeId];
    23903167  UInt height = g_scalingListSizeX[sizeId];
    2391   UInt ratio = g_scalingListSizeX[sizeId]/min(MAX_MATRIX_SIZE_NUM,(Int)g_scalingListSizeX[sizeId]);
    2392   Int *quantcoeff;
    2393   Int *coeff = scalingList->getScalingListAddress(sizeId,listId);
    2394   quantcoeff   = getQuantCoeff(listId, qp, sizeId);
    2395 
    2396   processScalingListEnc(coeff,quantcoeff,g_quantScales[qp]<<4,height,width,ratio,min(MAX_MATRIX_SIZE_NUM,(Int)g_scalingListSizeX[sizeId]),scalingList->getScalingListDC(sizeId,listId));
    2397 }
    2398 /** set quantized matrix coefficient for decode
    2399  * \param scalingList quantaized matrix address
    2400  * \param list List index
    2401  * \param size size index
    2402  * \param uiQP Quantization parameter
    2403  */
    2404 Void TComTrQuant::xSetScalingListDec(TComScalingList *scalingList, UInt listId, UInt sizeId, UInt qp)
    2405 {
    2406   UInt width = g_scalingListSizeX[sizeId];
    2407   UInt height = g_scalingListSizeX[sizeId];
    2408   UInt ratio = g_scalingListSizeX[sizeId]/min(MAX_MATRIX_SIZE_NUM,(Int)g_scalingListSizeX[sizeId]);
     3168  UInt ratio  = g_scalingListSizeX[sizeId]/min(MAX_MATRIX_SIZE_NUM,(Int)g_scalingListSizeX[sizeId]);
    24093169  Int *dequantcoeff;
    2410   Int *coeff = scalingList->getScalingListAddress(sizeId,listId);
     3170  const Int *coeff  = scalingList.getScalingListAddress(sizeId,listId);
    24113171
    24123172  dequantcoeff = getDequantCoeff(listId, qp, sizeId);
    2413   processScalingListDec(coeff,dequantcoeff,g_invQuantScales[qp],height,width,ratio,min(MAX_MATRIX_SIZE_NUM,(Int)g_scalingListSizeX[sizeId]),scalingList->getScalingListDC(sizeId,listId));
     3173
     3174  Int invQuantScale = g_invQuantScales[qp];
     3175
     3176  processScalingListDec(coeff,
     3177                        dequantcoeff,
     3178                        invQuantScale,
     3179                        height, width, ratio,
     3180                        min(MAX_MATRIX_SIZE_NUM, (Int)g_scalingListSizeX[sizeId]),
     3181                        scalingList.getScalingListDC(sizeId,listId));
    24143182}
    24153183
    24163184/** set flat matrix value to quantized coefficient
    24173185 */
    2418 Void TComTrQuant::setFlatScalingList()
    2419 {
    2420   UInt size,list;
    2421   UInt qp;
    2422 
    2423   for(size=0;size<SCALING_LIST_SIZE_NUM;size++)
    2424   {
    2425     for(list = 0; list <  g_scalingListNum[size]; list++)
    2426     {
    2427       for(qp=0;qp<SCALING_LIST_REM_NUM;qp++)
     3186Void TComTrQuant::setFlatScalingList(const Int maxLog2TrDynamicRange[MAX_NUM_CHANNEL_TYPE], const BitDepths &bitDepths)
     3187{
     3188  const Int minimumQp = 0;
     3189  const Int maximumQp = SCALING_LIST_REM_NUM;
     3190
     3191  for(UInt size = 0; size < SCALING_LIST_SIZE_NUM; size++)
     3192  {
     3193    for(UInt list = 0; list < SCALING_LIST_NUM; list++)
     3194    {
     3195      for(Int qp = minimumQp; qp < maximumQp; qp++)
    24283196      {
    24293197        xsetFlatScalingList(list,size,qp);
    2430         setErrScaleCoeff(list,size,qp);
     3198        setErrScaleCoeff(list,size,qp,maxLog2TrDynamicRange, bitDepths);
    24313199      }
    24323200    }
     
    24363204/** set flat matrix value to quantized coefficient
    24373205 * \param list List ID
    2438  * \param uiQP Quantization parameter
    2439  * \param uiSize Size
     3206 * \param size size index
     3207 * \param qp Quantization parameter (index into g_quantScales and
     3208 *           g_invQuantScales)
    24403209 */
    2441 Void TComTrQuant::xsetFlatScalingList(UInt list, UInt size, UInt qp)
     3210Void TComTrQuant::xsetFlatScalingList(UInt list, UInt size, Int qp)
    24423211{
    24433212  UInt i,num = g_scalingListSize[size];
    24443213  Int *quantcoeff;
    24453214  Int *dequantcoeff;
    2446   Int quantScales = g_quantScales[qp];
    2447   Int invQuantScales = g_invQuantScales[qp]<<4;
     3215
     3216  Int quantScales    = g_quantScales   [qp];
     3217  Int invQuantScales = g_invQuantScales[qp] << 4;
    24483218
    24493219  quantcoeff   = getQuantCoeff(list, qp, size);
     
    24513221
    24523222  for(i=0;i<num;i++)
    2453   { 
     3223  {
    24543224    *quantcoeff++ = quantScales;
    24553225    *dequantcoeff++ = invQuantScales;
     
    24693239Void TComTrQuant::processScalingListEnc( Int *coeff, Int *quantcoeff, Int quantScales, UInt height, UInt width, UInt ratio, Int sizuNum, UInt dc)
    24703240{
    2471   Int nsqth = (height < width) ? 4: 1; //height ratio for NSQT
    2472   Int nsqtw = (width < height) ? 4: 1; //width ratio for NSQT
    24733241  for(UInt j=0;j<height;j++)
    24743242  {
    24753243    for(UInt i=0;i<width;i++)
    24763244    {
    2477       quantcoeff[j*width + i] = quantScales / coeff[sizuNum * (j * nsqth / ratio) + i * nsqtw /ratio];
    2478     }
    2479   }
     3245      quantcoeff[j*width + i] = quantScales / coeff[sizuNum * (j / ratio) + i / ratio];
     3246    }
     3247  }
     3248
    24803249  if(ratio > 1)
    24813250  {
     
    24833252  }
    24843253}
     3254
    24853255/** set quantized matrix coefficient for decode
    24863256 * \param coeff quantized matrix address
     
    24933263 * \param dc dc parameter
    24943264 */
    2495 Void TComTrQuant::processScalingListDec( Int *coeff, Int *dequantcoeff, Int invQuantScales, UInt height, UInt width, UInt ratio, Int sizuNum, UInt dc)
     3265Void TComTrQuant::processScalingListDec( const Int *coeff, Int *dequantcoeff, Int invQuantScales, UInt height, UInt width, UInt ratio, Int sizuNum, UInt dc)
    24963266{
    24973267  for(UInt j=0;j<height;j++)
     
    25023272    }
    25033273  }
     3274
    25043275  if(ratio > 1)
    25053276  {
     
    25143285  for(UInt sizeId = 0; sizeId < SCALING_LIST_SIZE_NUM; sizeId++)
    25153286  {
    2516     for(UInt listId = 0; listId < g_scalingListNum[sizeId]; listId++)
    2517     {
    2518       for(UInt qp = 0; qp < SCALING_LIST_REM_NUM; qp++)
    2519       {
    2520         m_quantCoef   [sizeId][listId][qp] = new Int [g_scalingListSize[sizeId]];
    2521         m_dequantCoef [sizeId][listId][qp] = new Int [g_scalingListSize[sizeId]];
     3287    for(UInt qp = 0; qp < SCALING_LIST_REM_NUM; qp++)
     3288    {
     3289      for(UInt listId = 0; listId < SCALING_LIST_NUM; listId++)
     3290      {
     3291        m_quantCoef   [sizeId][listId][qp] = new Int    [g_scalingListSize[sizeId]];
     3292        m_dequantCoef [sizeId][listId][qp] = new Int    [g_scalingListSize[sizeId]];
    25223293        m_errScale    [sizeId][listId][qp] = new Double [g_scalingListSize[sizeId]];
    2523       }
    2524     }
    2525   }
    2526   // alias list [1] as [3].
    2527   for(UInt qp = 0; qp < SCALING_LIST_REM_NUM; qp++)
    2528   {
    2529     m_quantCoef   [SCALING_LIST_32x32][3][qp] = m_quantCoef   [SCALING_LIST_32x32][1][qp];
    2530     m_dequantCoef [SCALING_LIST_32x32][3][qp] = m_dequantCoef [SCALING_LIST_32x32][1][qp];
    2531     m_errScale    [SCALING_LIST_32x32][3][qp] = m_errScale    [SCALING_LIST_32x32][1][qp];
    2532   }
    2533 }
     3294      } // listID loop
     3295    }
     3296  }
     3297}
     3298
    25343299/** destroy quantization matrix array
    25353300 */
     
    25383303  for(UInt sizeId = 0; sizeId < SCALING_LIST_SIZE_NUM; sizeId++)
    25393304  {
    2540     for(UInt listId = 0; listId < g_scalingListNum[sizeId]; listId++)
     3305    for(UInt listId = 0; listId < SCALING_LIST_NUM; listId++)
    25413306    {
    25423307      for(UInt qp = 0; qp < SCALING_LIST_REM_NUM; qp++)
    25433308      {
    2544         if(m_quantCoef   [sizeId][listId][qp]) delete [] m_quantCoef   [sizeId][listId][qp];
    2545         if(m_dequantCoef [sizeId][listId][qp]) delete [] m_dequantCoef [sizeId][listId][qp];
    2546         if(m_errScale    [sizeId][listId][qp]) delete [] m_errScale    [sizeId][listId][qp];
    2547       }
    2548     }
     3309        if(m_quantCoef[sizeId][listId][qp])
     3310        {
     3311          delete [] m_quantCoef[sizeId][listId][qp];
     3312        }
     3313        if(m_dequantCoef[sizeId][listId][qp])
     3314        {
     3315          delete [] m_dequantCoef[sizeId][listId][qp];
     3316        }
     3317        if(m_errScale[sizeId][listId][qp])
     3318        {
     3319          delete [] m_errScale[sizeId][listId][qp];
     3320        }
     3321      }
     3322    }
     3323  }
     3324}
     3325
     /** Quantise one residual sample as a transform-skip coefficient and store the
      *  clipped result in pcCoeff[uiPos].
      * \param rTu                    TU providing the CU, partition index, component rectangle and equivalent log2 transform size
      * \param compID                 component being processed
      * \param resiDiff               residual sample to quantise
      * \param pcCoeff                coefficient buffer; only pcCoeff[uiPos] is written
      * \param uiPos                  index into pcCoeff and, when scaling lists are enabled, into the quantization table
      * \param cQP                    QP parameters; cQP.per and cQP.rem are read
      * \param bUseHalfRoundingPoint  true: rounding offset 256/512; false: 171/512 for I slices, 85/512 otherwise
      */
     3326Void TComTrQuant::transformSkipQuantOneSample(TComTU &rTu, const ComponentID compID, const TCoeff resiDiff, TCoeff* pcCoeff, const UInt uiPos, const QpParam &cQP, const Bool bUseHalfRoundingPoint)
     3327{
     3328        TComDataCU    *pcCU                           = rTu.getCU();
     3329  const UInt           uiAbsPartIdx                   = rTu.GetAbsPartIdxTU();
     3330  const TComRectangle &rect                           = rTu.getRect(compID);
     3331  const UInt           uiWidth                        = rect.width;
     3332  const UInt           uiHeight                       = rect.height;
     3333  const Int            maxLog2TrDynamicRange          = pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID));
     3334  const Int            channelBitDepth                = pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
     3335  const Int            iTransformShift                = getTransformShift(channelBitDepth, rTu.GetEquivalentLog2TrSize(compID), maxLog2TrDynamicRange);
     3336  const Int            scalingListType                = getScalingListType(pcCU->getPredictionMode(uiAbsPartIdx), compID);
     3337  const Bool           enableScalingLists             = getUseScalingList(uiWidth, uiHeight, true);
     3338  const Int            defaultQuantisationCoefficient = g_quantScales[cQP.rem];
     3339
     3340  assert( scalingListType < SCALING_LIST_NUM );
              // The size index passed to getQuantCoeff is the equivalent log2 transform size minus 2.
     3341  const Int *const piQuantCoeff = getQuantCoeff( scalingListType, cQP.rem, (rTu.GetEquivalentLog2TrSize(compID)-2) );
     3342
     3343
     3344  /* for 422 chroma blocks, the effective scaling applied during transformation is not a power of 2, hence it cannot be
     3345  * implemented as a bit-shift (the quantised result will be sqrt(2) * larger than required). Alternatively, adjust the
     3346  * uiLog2TrSize applied in iTransformShift, such that the result is 1/sqrt(2) the required result (i.e. smaller)
     3347  * Then a QP+3 (sqrt(2)) or QP-3 (1/sqrt(2)) method could be used to get the required result
     3348  */
     3349
     3350  const Int iQBits = QUANT_SHIFT + cQP.per + iTransformShift;
     3351  // QBits will be OK for any internal bit depth as the reduction in transform shift is balanced by an increase in Qp_per due to QpBDOffset
     3352
              // Rounding offset given in units of 1/512 of (1 << iQBits), hence the (iQBits - 9) shift.
     3353  const Int iAdd = ( bUseHalfRoundingPoint ? 256 : (pcCU->getSlice()->getSliceType() == I_SLICE ? 171 : 85) ) << (iQBits - 9);
     3354
     3355  TCoeff transformedCoefficient;
     3356
     3357  // transform-skip
     3358  if (iTransformShift >= 0)
     3359  {
     3360    transformedCoefficient = resiDiff << iTransformShift;
     3361  }
     3362  else // for very high bit depths
     3363  {
                // Negative transform shift: right shift by its magnitude, with a rounding offset.
     3364    const Int iTrShiftNeg  = -iTransformShift;
     3365    const Int offset       = 1 << (iTrShiftNeg - 1);
     3366    transformedCoefficient = ( resiDiff + offset ) >> iTrShiftNeg;
     3367  }
     3368
     3369  // quantization
              // Quantise the magnitude and re-apply the sign afterwards.
     3370  const TCoeff iSign = (transformedCoefficient < 0 ? -1: 1);
     3371
     3372  const Int quantisationCoefficient = enableScalingLists ? piQuantCoeff[uiPos] : defaultQuantisationCoefficient;
     3373
              // The product is formed in 64 bits before the rounding shift.
     3374  const Int64 tmpLevel = (Int64)abs(transformedCoefficient) * quantisationCoefficient;
     3375
     3376  const TCoeff quantisedCoefficient = (TCoeff((tmpLevel + iAdd ) >> iQBits)) * iSign;
     3377
              // Clip to [-(1 << maxLog2TrDynamicRange), (1 << maxLog2TrDynamicRange) - 1] before storing.
     3378  const TCoeff entropyCodingMinimum = -(1 << maxLog2TrDynamicRange);
     3379  const TCoeff entropyCodingMaximum =  (1 << maxLog2TrDynamicRange) - 1;
     3380  pcCoeff[ uiPos ] = Clip3<TCoeff>( entropyCodingMinimum, entropyCodingMaximum, quantisedCoefficient );
     3381}
     3382
     3383
     /** Dequantise one transform-skip coefficient and undo the transform-skip
      *  shift, writing the result to reconSample.
      * \param rTu          TU providing the CU, partition index, component rectangle and equivalent log2 transform size
      * \param compID       component being processed
      * \param inSample     quantised coefficient to dequantise
      * \param reconSample  output: reconstructed residual sample
      * \param cQP          QP parameters; cQP.per and cQP.rem are read
      * \param uiPos        index into the dequantization table (used only when scaling lists are enabled)
      */
     3384Void TComTrQuant::invTrSkipDeQuantOneSample( TComTU &rTu, ComponentID compID, TCoeff inSample, Pel &reconSample, const QpParam &cQP, UInt uiPos )
     3385{
     3386        TComDataCU    *pcCU               = rTu.getCU();
     3387  const UInt           uiAbsPartIdx       = rTu.GetAbsPartIdxTU();
     3388  const TComRectangle &rect               = rTu.getRect(compID);
     3389  const UInt           uiWidth            = rect.width;
     3390  const UInt           uiHeight           = rect.height;
     3391  const Int            QP_per             = cQP.per;
     3392  const Int            QP_rem             = cQP.rem;
     3393  const Int            maxLog2TrDynamicRange = pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID));
              // With O0043_BEST_EFFORT_DECODING the stream bit depth is used instead of the SPS bit depth.
     3394#if O0043_BEST_EFFORT_DECODING
     3395  const Int            channelBitDepth    = pcCU->getSlice()->getSPS()->getStreamBitDepth(toChannelType(compID));
     3396#else
     3397  const Int            channelBitDepth    = pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
     3398#endif
     3399  const Int            iTransformShift    = getTransformShift(channelBitDepth, rTu.GetEquivalentLog2TrSize(compID), maxLog2TrDynamicRange);
     3400  const Int            scalingListType    = getScalingListType(pcCU->getPredictionMode(uiAbsPartIdx), compID);
     3401  const Bool           enableScalingLists = getUseScalingList(uiWidth, uiHeight, true);
     3402  const UInt           uiLog2TrSize       = rTu.GetEquivalentLog2TrSize(compID);
     3403
     3404  assert( scalingListType < SCALING_LIST_NUM );
     3405
              // Net shift applied after the multiply; it may be zero or negative, in which
              // case the branches below shift left instead. LOG2_SCALING_LIST_NEUTRAL_VALUE
              // is added only when scaling lists are enabled.
     3406  const Int rightShift = (IQUANT_SHIFT - (iTransformShift + QP_per)) + (enableScalingLists ? LOG2_SCALING_LIST_NEUTRAL_VALUE : 0);
     3407
     3408  const TCoeff transformMinimum = -(1 << maxLog2TrDynamicRange);
     3409  const TCoeff transformMaximum =  (1 << maxLog2TrDynamicRange) - 1;
     3410
     3411  // Dequantisation
     3412
     3413  TCoeff dequantisedSample;
     3414
     3415  if(enableScalingLists)
     3416  {
                // The input is clipped to targetInputBitDepth bits before the multiply.
                // NOTE(review): presumably so the product with the dequantization coefficient
                // fits in Intermediate_Int - confirm.
     3417    const UInt             dequantCoefBits     = 1 + IQUANT_SHIFT + SCALING_LIST_BITS;
     3418    const UInt             targetInputBitDepth = std::min<UInt>((maxLog2TrDynamicRange + 1), (((sizeof(Intermediate_Int) * 8) + rightShift) - dequantCoefBits));
     3419
     3420    const Intermediate_Int inputMinimum        = -(1 << (targetInputBitDepth - 1));
     3421    const Intermediate_Int inputMaximum        =  (1 << (targetInputBitDepth - 1)) - 1;
     3422
     3423    Int *piDequantCoef = getDequantCoeff(scalingListType,QP_rem,uiLog2TrSize-2);
     3424
     3425    if(rightShift > 0)
     3426    {
                  // Rounding right shift, then clip to the transform coefficient range.
     3427      const Intermediate_Int iAdd      = 1 << (rightShift - 1);
     3428      const TCoeff           clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, inSample));
     3429      const Intermediate_Int iCoeffQ   = ((Intermediate_Int(clipQCoef) * piDequantCoef[uiPos]) + iAdd ) >> rightShift;
     3430
     3431      dequantisedSample = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
     3432    }
     3433    else
     3434    {
                  // Non-positive rightShift: shift left by its magnitude (no rounding offset).
     3435      const Int              leftShift = -rightShift;
     3436      const TCoeff           clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, inSample));
     3437      const Intermediate_Int iCoeffQ   = (Intermediate_Int(clipQCoef) * piDequantCoef[uiPos]) << leftShift;
     3438
     3439      dequantisedSample = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
     3440    }
     3441  }
     3442  else
     3443  {
                // Flat dequantization: a single scale from g_invQuantScales replaces the per-position table.
     3444    const Int scale     =  g_invQuantScales[QP_rem];
     3445    const Int scaleBits =     (IQUANT_SHIFT + 1)   ;
     3446
     3447    const UInt             targetInputBitDepth = std::min<UInt>((maxLog2TrDynamicRange + 1), (((sizeof(Intermediate_Int) * 8) + rightShift) - scaleBits));
     3448    const Intermediate_Int inputMinimum        = -(1 << (targetInputBitDepth - 1));
     3449    const Intermediate_Int inputMaximum        =  (1 << (targetInputBitDepth - 1)) - 1;
     3450
     3451    if (rightShift > 0)
     3452    {
     3453      const Intermediate_Int iAdd      = 1 << (rightShift - 1);
     3454      const TCoeff           clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, inSample));
     3455      const Intermediate_Int iCoeffQ   = (Intermediate_Int(clipQCoef) * scale + iAdd) >> rightShift;
     3456
     3457      dequantisedSample = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
     3458    }
     3459    else
     3460    {
     3461      const Int              leftShift = -rightShift;
     3462      const TCoeff           clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, inSample));
     3463      const Intermediate_Int iCoeffQ   = (Intermediate_Int(clipQCoef) * scale) << leftShift;
     3464
     3465      dequantisedSample = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
     3466    }
     3467  }
     3468
     3469  // Inverse transform-skip
     3470
     3471  if (iTransformShift >= 0)
     3472  {
                // Rounding right shift; the offset is 0 when the shift itself is 0.
     3473    const TCoeff offset = iTransformShift==0 ? 0 : (1 << (iTransformShift - 1));
     3474    reconSample =  Pel(( dequantisedSample + offset ) >> iTransformShift);
     3475  }
     3476  else //for very high bit depths
     3477  {
     3478    const Int iTrShiftNeg = -iTransformShift;
     3479    reconSample = Pel(dequantisedSample << iTrShiftNeg);
     3480  }
     3481}
     3482
     3483
     /** Apply cross-component prediction row by row over a width x height block.
      *  Forward (reverse == false): T = C - ((alpha * rightShift(L, diffBitDepth)) >> 3).
      *  Reverse (reverse == true):  T = C + ((alpha * rightShift(L, diffBitDepth)) >> 3),
      *  clipped to the range of Pel unless RExt__HIGH_BIT_DEPTH_SUPPORT is set.
      * \param rTu      TU providing the CU and the partition index used to look up alpha
      * \param compID   component whose cross-component prediction alpha is used
      * \param piResiL  first input residual (the one scaled by alpha)
      * \param piResiC  second input residual (the one alpha * L is added to / subtracted from)
      * \param piResiT  output residual
      * \param width    number of samples processed per row
      * \param height   number of rows processed
      * \param strideL  per-row pointer increment for piResiL
      * \param strideC  per-row pointer increment for piResiC
      * \param strideT  per-row pointer increment for piResiT
      * \param reverse  selects the reverse (additive) direction when true
      */
     3484Void TComTrQuant::crossComponentPrediction(       TComTU      & rTu,
     3485                                            const ComponentID   compID,
     3486                                            const Pel         * piResiL,
     3487                                            const Pel         * piResiC,
     3488                                                  Pel         * piResiT,
     3489                                            const Int           width,
     3490                                            const Int           height,
     3491                                            const Int           strideL,
     3492                                            const Int           strideC,
     3493                                            const Int           strideT,
     3494                                            const Bool          reverse )
     3495{
     3496  const Pel *pResiL = piResiL;
     3497  const Pel *pResiC = piResiC;
     3498        Pel *pResiT = piResiT;
     3499
     3500  TComDataCU *pCU = rTu.getCU();
     3501  const Int alpha = pCU->getCrossComponentPredictionAlpha( rTu.GetAbsPartIdxTU( compID ), compID );
     3502  const Int diffBitDepth = pCU->getSlice()->getSPS()->getDifferentialLumaChromaBitDepth();
     3503
     3504  for( Int y = 0; y < height; y++ )
     3505  {
     3506    if (reverse)
     3507    {
     3508      // A constraint is to be added to the HEVC Standard to limit the size of pResiL and pResiC at this point.
     3509      // The likely form of the constraint is to either restrict the values to CoeffMin to CoeffMax,
     3510      // or to be representable in a bitDepthY+4 or bitDepthC+4 signed integer.
     3511      //  The result of the constraint is that for 8/10/12bit profiles, the input values
     3512      //  can be represented within a 16-bit Pel-type.
     3513#if RExt__HIGH_BIT_DEPTH_SUPPORT
                  // High-bit-depth build: the result is stored without clipping.
     3514      for( Int x = 0; x < width; x++ )
     3515      {
     3516        pResiT[x] = pResiC[x] + (( alpha * rightShift( pResiL[x], diffBitDepth) ) >> 3);
     3517      }
     3518#else
                  // Otherwise the sum is computed as Int and clipped to the representable range of Pel.
     3519      const Int minPel=std::numeric_limits<Pel>::min();
     3520      const Int maxPel=std::numeric_limits<Pel>::max();
     3521      for( Int x = 0; x < width; x++ )
     3522      {
     3523        pResiT[x] = Clip3<Int>(minPel, maxPel, pResiC[x] + (( alpha * rightShift<Int>(Int(pResiL[x]), diffBitDepth) ) >> 3));
     3524      }
     3525#endif
     3526    }
     3527    else
     3528    {
     3529      // Forward does not need clipping. Pel type should always be big enough.
     3530      for( Int x = 0; x < width; x++ )
     3531      {
     3532        pResiT[x] = pResiC[x] - (( alpha * rightShift<Int>(Int(pResiL[x]), diffBitDepth) ) >> 3);
     3533      }
     3534    }
     3535
                // Advance all three row pointers by their own strides.
     3536    pResiL += strideL;
     3537    pResiC += strideC;
     3538    pResiT += strideT;
    25493539  }
    25503540}
Note: See TracChangeset for help on using the changeset viewer.