Context navigation

TComTrQuant.cpp @ 1417

Visit:

Last change on this file since 1417 was 1413, checked in by tech, 6 years ago
Merged HTM-16.2-dev@1412
Property svn:eol-style set to `native`
File size: 130.3 KB

Rev	Line
[5]	1	/* The copyright in this software is being made available under the BSD
	2	* License, included below. This software may be subject to other third party
	3	* and contributor rights, including patent rights, and no such rights are
[1313]	4	* granted under this license.
[5]	5	*
[1413]	6	* Copyright (c) 2010-2017, ITU/ISO/IEC
[5]	7	* All rights reserved.
	8	*
	9	* Redistribution and use in source and binary forms, with or without
	10	* modification, are permitted provided that the following conditions are met:
	11	*
	12	* * Redistributions of source code must retain the above copyright notice,
	13	* this list of conditions and the following disclaimer.
	14	* * Redistributions in binary form must reproduce the above copyright notice,
	15	* this list of conditions and the following disclaimer in the documentation
	16	* and/or other materials provided with the distribution.
[56]	17	* * Neither the name of the ITU/ISO/IEC nor the names of its contributors may
[5]	18	* be used to endorse or promote products derived from this software without
	19	* specific prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	22	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	23	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	24	* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
	25	* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	26	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	27	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	28	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	29	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	30	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
	31	* THE POSSIBILITY OF SUCH DAMAGE.
	32	*/
[2]	33
	34	/** \file TComTrQuant.cpp
	35	\brief transform and quantization class
	36	*/
	37
	38	#include <stdlib.h>
	39	#include <math.h>
[1313]	40	#include <limits>
[2]	41	#include <memory.h>
	42	#include "TComTrQuant.h"
	43	#include "TComPic.h"
	44	#include "ContextTables.h"
[1313]	45	#include "TComTU.h"
	46	#include "Debug.h"
[2]	47
[56]	48	typedef struct
	49	{
	50	Int iNNZbeforePos0;
	51	Double d64CodedLevelandDist; // distortion and level cost only
	52	Double d64UncodedDist; // all zero coded block distortion
	53	Double d64SigCost;
	54	Double d64SigCost_0;
	55	} coeffGroupRDStats;
	56
	57	//! \ingroup TLibCommon
	58	//! \{
	59
[2]	60	// ====================================================================================================================
	61	// Constants
	62	// ====================================================================================================================
	63
	64	#define RDOQ_CHROMA 1 ///< use of RDOQ in chroma
	65
[1313]	66
[2]	67	// ====================================================================================================================
[1313]	68	// QpParam constructor
[2]	69	// ====================================================================================================================
	70
[1313]	71	QpParam::QpParam(const Int qpy,
	72	const ChannelType chType,
	73	const Int qpBdOffset,
	74	const Int chromaQPOffset,
	75	const ChromaFormat chFmt )
	76	{
	77	Int baseQp;
[2]	78
[1313]	79	if(isLuma(chType))
	80	{
	81	baseQp = qpy + qpBdOffset;
	82	}
	83	else
	84	{
	85	baseQp = Clip3( -qpBdOffset, (chromaQPMappingTableSize - 1), qpy + chromaQPOffset );
[2]	86
[1313]	87	if(baseQp < 0)
	88	{
	89	baseQp = baseQp + qpBdOffset;
	90	}
	91	else
	92	{
	93	baseQp = getScaledChromaQP(baseQp, chFmt) + qpBdOffset;
	94	}
	95	}
	96
	97	Qp =baseQp;
	98	per=baseQp/6;
	99	rem=baseQp%6;
	100	}
	101
	102	QpParam::QpParam(const TComDataCU &cu, const ComponentID compID)
[2]	103	{
[1313]	104	Int chromaQpOffset = 0;
	105
	106	if (isChroma(compID))
	107	{
	108	chromaQpOffset += cu.getSlice()->getPPS()->getQpOffset(compID);
	109	chromaQpOffset += cu.getSlice()->getSliceChromaQpDelta(compID);
	110
	111	chromaQpOffset += cu.getSlice()->getPPS()->getPpsRangeExtension().getChromaQpOffsetListEntry(cu.getChromaQpAdj(0)).u.offset[Int(compID)-1];
	112	}
	113
	114	*this = QpParam(cu.getQP( 0 ),
	115	toChannelType(compID),
	116	cu.getSlice()->getSPS()->getQpBDOffset(toChannelType(compID)),
	117	chromaQpOffset,
	118	cu.getPic()->getChromaFormat());
[2]	119	}
	120
[1313]	121
[2]	122	// ====================================================================================================================
	123	// TComTrQuant class member functions
	124	// ====================================================================================================================
	125
	126	TComTrQuant::TComTrQuant()
	127	{
	128	// allocate temporary buffers
[1313]	129	m_plTempCoeff = new TCoeff[ MAX_CU_SIZE*MAX_CU_SIZE ];
	130
[2]	131	// allocate bit estimation class (for RDOQ)
	132	m_pcEstBitsSbac = new estBitsSbacStruct;
[56]	133	initScalingList();
[2]	134	}
	135
	136	TComTrQuant::~TComTrQuant()
	137	{
	138	// delete temporary buffers
	139	if ( m_plTempCoeff )
	140	{
	141	delete [] m_plTempCoeff;
	142	m_plTempCoeff = NULL;
	143	}
[1313]	144
[2]	145	// delete bit estimation class
[56]	146	if ( m_pcEstBitsSbac )
	147	{
	148	delete m_pcEstBitsSbac;
	149	}
	150	destroyScalingList();
[2]	151	}
	152
[56]	153	#if ADAPTIVE_QP_SELECTION
	154	Void TComTrQuant::storeSliceQpNext(TComSlice* pcSlice)
	155	{
[1313]	156	// NOTE: does this work with negative QPs or when some blocks are transquant-bypass enabled?
	157
[56]	158	Int qpBase = pcSlice->getSliceQpBase();
	159	Int sliceQpused = pcSlice->getSliceQp();
	160	Int sliceQpnext;
	161	Double alpha = qpBase < 17 ? 0.5 : 1;
[1313]	162
[56]	163	Int cnt=0;
[608]	164	for(Int u=1; u<=LEVEL_RANGE; u++)
[1313]	165	{
[56]	166	cnt += m_sliceNsamples[u] ;
	167	}
	168
[608]	169	if( !m_useRDOQ )
[56]	170	{
	171	sliceQpused = qpBase;
	172	alpha = 0.5;
	173	}
	174
	175	if( cnt > 120 )
	176	{
	177	Double sum = 0;
	178	Int k = 0;
	179	for(Int u=1; u<LEVEL_RANGE; u++)
	180	{
	181	sum += u*m_sliceSumC[u];
	182	k += uum_sliceNsamples[u];
	183	}
	184
	185	Int v;
	186	Double q[MAX_QP+1] ;
	187	for(v=0; v<=MAX_QP; v++)
	188	{
	189	q[v] = (Double)(g_invQuantScales[v%6] * (1<<(v/6)))/64 ;
	190	}
	191
	192	Double qnext = sum/k * q[sliceQpused] / (1<<ARL_C_PRECISION);
	193
	194	for(v=0; v<MAX_QP; v++)
	195	{
	196	if(qnext < alpha * q[v] + (1 - alpha) * q[v+1] )
	197	{
	198	break;
	199	}
	200	}
	201	sliceQpnext = Clip3(sliceQpused - 3, sliceQpused + 3, v);
	202	}
	203	else
	204	{
	205	sliceQpnext = sliceQpused;
	206	}
	207
[1313]	208	m_qpDelta[qpBase] = sliceQpnext - qpBase;
[56]	209	}
	210
	211	Void TComTrQuant::initSliceQpDelta()
	212	{
	213	for(Int qp=0; qp<=MAX_QP; qp++)
	214	{
	215	m_qpDelta[qp] = qp < 17 ? 0 : 1;
	216	}
	217	}
	218
	219	Void TComTrQuant::clearSliceARLCnt()
[1313]	220	{
[56]	221	memset(m_sliceSumC, 0, sizeof(Double)*(LEVEL_RANGE+1));
	222	memset(m_sliceNsamples, 0, sizeof(Int)*(LEVEL_RANGE+1));
	223	}
	224	#endif
	225
	226
	227
[2]	228	#if MATRIX_MULT
	229	/** NxN forward transform (2D) using brute force matrix multiplication (3 nested loops)
	230	* \param block pointer to input data (residual)
	231	* \param coeff pointer to output data (transform coefficients)
	232	* \param uiStride stride of input data
	233	* \param uiTrSize transform size (uiTrSize x uiTrSize)
	234	* \param uiMode is Intra Prediction mode used in Mode-Dependent DCT/DST only
	235	*/
[1313]	236	Void xTr(Int bitDepth, Pel block, TCoeff coeff, UInt uiStride, UInt uiTrSize, Bool useDST, const Int maxLog2TrDynamicRange)
[2]	237	{
[1313]	238	UInt i,j,k;
	239	TCoeff iSum;
	240	TCoeff tmp[MAX_TU_SIZE * MAX_TU_SIZE];
	241	const TMatrixCoeff *iT;
[2]	242	UInt uiLog2TrSize = g_aucConvertToBit[ uiTrSize ] + 2;
	243
	244	if (uiTrSize==4)
	245	{
[1313]	246	iT = (useDST ? g_as_DST_MAT_4[TRANSFORM_FORWARD][0] : g_aiT4[TRANSFORM_FORWARD][0]);
[2]	247	}
	248	else if (uiTrSize==8)
	249	{
[1313]	250	iT = g_aiT8[TRANSFORM_FORWARD][0];
[2]	251	}
	252	else if (uiTrSize==16)
	253	{
[1313]	254	iT = g_aiT16[TRANSFORM_FORWARD][0];
[2]	255	}
	256	else if (uiTrSize==32)
	257	{
[1313]	258	iT = g_aiT32[TRANSFORM_FORWARD][0];
[2]	259	}
[56]	260	else
	261	{
[2]	262	assert(0);
	263	}
	264
[1313]	265	const Int TRANSFORM_MATRIX_SHIFT = g_transformMatrixShift[TRANSFORM_FORWARD];
[2]	266
[1313]	267	const Int shift_1st = (uiLog2TrSize + bitDepth + TRANSFORM_MATRIX_SHIFT) - maxLog2TrDynamicRange;
	268	const Int shift_2nd = uiLog2TrSize + TRANSFORM_MATRIX_SHIFT;
	269	const Int add_1st = (shift_1st>0) ? (1<<(shift_1st-1)) : 0;
	270	const Int add_2nd = 1<<(shift_2nd-1);
	271
[2]	272	/* Horizontal transform */
	273
	274	for (i=0; i<uiTrSize; i++)
	275	{
	276	for (j=0; j<uiTrSize; j++)
	277	{
	278	iSum = 0;
	279	for (k=0; k<uiTrSize; k++)
	280	{
	281	iSum += iT[iuiTrSize+k]block[j*uiStride+k];
	282	}
	283	tmp[i*uiTrSize+j] = (iSum + add_1st)>>shift_1st;
	284	}
	285	}
[1313]	286
[56]	287	/* Vertical transform */
[1313]	288	for (i=0; i<uiTrSize; i++)
[2]	289	{
	290	for (j=0; j<uiTrSize; j++)
	291	{
	292	iSum = 0;
	293	for (k=0; k<uiTrSize; k++)
	294	{
[1313]	295	iSum += iT[iuiTrSize+k]tmp[j*uiTrSize+k];
[2]	296	}
[1313]	297	coeff[i*uiTrSize+j] = (iSum + add_2nd)>>shift_2nd;
[2]	298	}
[56]	299	}
[2]	300	}
	301
	302	/** NxN inverse transform (2D) using brute force matrix multiplication (3 nested loops)
	303	* \param coeff pointer to input data (transform coefficients)
	304	* \param block pointer to output data (residual)
	305	* \param uiStride stride of output data
	306	* \param uiTrSize transform size (uiTrSize x uiTrSize)
	307	* \param uiMode is Intra Prediction mode used in Mode-Dependent DCT/DST only
	308	*/
[1313]	309	Void xITr(Int bitDepth, TCoeff coeff, Pel block, UInt uiStride, UInt uiTrSize, Bool useDST, const Int maxLog2TrDynamicRange)
[2]	310	{
[1313]	311	UInt i,j,k;
	312	TCoeff iSum;
	313	TCoeff tmp[MAX_TU_SIZE * MAX_TU_SIZE];
	314	const TMatrixCoeff *iT;
	315
[2]	316	if (uiTrSize==4)
	317	{
[1313]	318	iT = (useDST ? g_as_DST_MAT_4[TRANSFORM_INVERSE][0] : g_aiT4[TRANSFORM_INVERSE][0]);
[2]	319	}
	320	else if (uiTrSize==8)
	321	{
[1313]	322	iT = g_aiT8[TRANSFORM_INVERSE][0];
[2]	323	}
	324	else if (uiTrSize==16)
	325	{
[1313]	326	iT = g_aiT16[TRANSFORM_INVERSE][0];
[2]	327	}
	328	else if (uiTrSize==32)
	329	{
[1313]	330	iT = g_aiT32[TRANSFORM_INVERSE][0];
[2]	331	}
[56]	332	else
	333	{
[2]	334	assert(0);
	335	}
[1313]	336
	337	const Int TRANSFORM_MATRIX_SHIFT = g_transformMatrixShift[TRANSFORM_INVERSE];
	338
	339	const Int shift_1st = TRANSFORM_MATRIX_SHIFT + 1; //1 has been added to shift_1st at the expense of shift_2nd
	340	const Int shift_2nd = (TRANSFORM_MATRIX_SHIFT + maxLog2TrDynamicRange - 1) - bitDepth;
	341	const TCoeff clipMinimum = -(1 << maxLog2TrDynamicRange);
	342	const TCoeff clipMaximum = (1 << maxLog2TrDynamicRange) - 1;
	343	assert(shift_2nd>=0);
	344	const Int add_1st = 1<<(shift_1st-1);
	345	const Int add_2nd = (shift_2nd>0) ? (1<<(shift_2nd-1)) : 0;
	346
[2]	347	/* Horizontal transform */
	348	for (i=0; i<uiTrSize; i++)
[1313]	349	{
[2]	350	for (j=0; j<uiTrSize; j++)
	351	{
	352	iSum = 0;
	353	for (k=0; k<uiTrSize; k++)
[1313]	354	{
	355	iSum += iT[kuiTrSize+i]coeff[k*uiTrSize+j];
[2]	356	}
[1313]	357
	358	// Clipping here is not in the standard, but is used to protect the "Pel" data type into which the inverse-transformed samples will be copied
	359	tmp[i*uiTrSize+j] = Clip3<TCoeff>(clipMinimum, clipMaximum, (iSum + add_1st)>>shift_1st);
[2]	360	}
	361	}
[1313]	362
[2]	363	/* Vertical transform */
	364	for (i=0; i<uiTrSize; i++)
[1313]	365	{
[2]	366	for (j=0; j<uiTrSize; j++)
	367	{
	368	iSum = 0;
	369	for (k=0; k<uiTrSize; k++)
[1313]	370	{
[2]	371	iSum += iT[kuiTrSize+j]tmp[i*uiTrSize+k];
	372	}
[1313]	373
	374	block[i*uiStride+j] = Clip3<TCoeff>(std::numeric_limits<Pel>::min(), std::numeric_limits<Pel>::max(), (iSum + add_2nd)>>shift_2nd);
[2]	375	}
	376	}
	377	}
	378
[1313]	379	#endif //MATRIX_MULT
[2]	380
[1313]	381
[2]	382	/** 4x4 forward transform implemented using partial butterfly structure (1D)
[56]	383	* \param src input data (residual)
	384	* \param dst output data (transform coefficients)
[2]	385	* \param shift specifies right shift after 1D transform
[1313]	386	* \param line
[2]	387	*/
[1313]	388	Void partialButterfly4(TCoeff src, TCoeff dst, Int shift, Int line)
[56]	389	{
[608]	390	Int j;
[1313]	391	TCoeff E[2],O[2];
	392	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
[56]	393
	394	for (j=0; j<line; j++)
[1313]	395	{
[56]	396	/* E and O */
	397	E[0] = src[0] + src[3];
	398	O[0] = src[0] - src[3];
	399	E[1] = src[1] + src[2];
	400	O[1] = src[1] - src[2];
	401
[1313]	402	dst[0] = (g_aiT4[TRANSFORM_FORWARD][0][0]E[0] + g_aiT4[TRANSFORM_FORWARD][0][1]E[1] + add)>>shift;
	403	dst[2line] = (g_aiT4[TRANSFORM_FORWARD][2][0]E[0] + g_aiT4[TRANSFORM_FORWARD][2][1]*E[1] + add)>>shift;
	404	dst[line] = (g_aiT4[TRANSFORM_FORWARD][1][0]O[0] + g_aiT4[TRANSFORM_FORWARD][1][1]O[1] + add)>>shift;
	405	dst[3line] = (g_aiT4[TRANSFORM_FORWARD][3][0]O[0] + g_aiT4[TRANSFORM_FORWARD][3][1]*O[1] + add)>>shift;
[56]	406
	407	src += 4;
	408	dst ++;
	409	}
	410	}
	411
[1313]	412	// Fast DST Algorithm. Full matrix multiplication for DST and Fast DST algorithm
[2]	413	// give identical results
[1313]	414	Void fastForwardDst(TCoeff block, TCoeff coeff, Int shift) // input block, output coeff
[2]	415	{
[1313]	416	Int i;
	417	TCoeff c[4];
	418	TCoeff rnd_factor = (shift > 0) ? (1<<(shift-1)) : 0;
[2]	419	for (i=0; i<4; i++)
	420	{
	421	// Intermediate Variables
[1313]	422	c[0] = block[4*i+0];
	423	c[1] = block[4*i+1];
	424	c[2] = block[4*i+2];
	425	c[3] = block[4*i+3];
[56]	426
[1313]	427	for (Int row = 0; row < 4; row++)
	428	{
	429	TCoeff result = 0;
	430	for (Int column = 0; column < 4; column++)
	431	{
	432	result += c[column] * g_as_DST_MAT_4[TRANSFORM_FORWARD][row][column]; // use the defined matrix, rather than hard-wired numbers
	433	}
	434
	435	coeff[(row * 4) + i] = rightShift((result + rnd_factor), shift);
	436	}
[2]	437	}
	438	}
[56]	439
[1313]	440	Void fastInverseDst(TCoeff tmp, TCoeff block, Int shift, const TCoeff outputMinimum, const TCoeff outputMaximum) // input tmp, output block
[2]	441	{
[1313]	442	Int i;
	443	TCoeff c[4];
	444	TCoeff rnd_factor = (shift > 0) ? (1<<(shift-1)) : 0;
[2]	445	for (i=0; i<4; i++)
[1313]	446	{
[2]	447	// Intermediate Variables
[1313]	448	c[0] = tmp[ i];
	449	c[1] = tmp[4 +i];
	450	c[2] = tmp[8 +i];
	451	c[3] = tmp[12+i];
[56]	452
[1313]	453	for (Int column = 0; column < 4; column++)
	454	{
	455	TCoeff &result = block[(i * 4) + column];
	456
	457	result = 0;
	458	for (Int row = 0; row < 4; row++)
	459	{
	460	result += c[row] * g_as_DST_MAT_4[TRANSFORM_INVERSE][row][column]; // use the defined matrix, rather than hard-wired numbers
	461	}
	462
	463	result = Clip3( outputMinimum, outputMaximum, rightShift((result + rnd_factor), shift));
	464	}
[2]	465	}
	466	}
[56]	467
[1313]	468	/** 4x4 inverse transform implemented using partial butterfly structure (1D)
	469	* \param src input data (transform coefficients)
	470	* \param dst output data (residual)
	471	* \param shift specifies right shift after 1D transform
	472	* \param line
	473	* \param outputMinimum minimum for clipping
	474	* \param outputMaximum maximum for clipping
	475	*/
	476	Void partialButterflyInverse4(TCoeff src, TCoeff dst, Int shift, Int line, const TCoeff outputMinimum, const TCoeff outputMaximum)
[56]	477	{
[608]	478	Int j;
[1313]	479	TCoeff E[2],O[2];
	480	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
[56]	481
	482	for (j=0; j<line; j++)
[1313]	483	{
	484	/* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
	485	O[0] = g_aiT4[TRANSFORM_INVERSE][1][0]src[line] + g_aiT4[TRANSFORM_INVERSE][3][0]src[3*line];
	486	O[1] = g_aiT4[TRANSFORM_INVERSE][1][1]src[line] + g_aiT4[TRANSFORM_INVERSE][3][1]src[3*line];
	487	E[0] = g_aiT4[TRANSFORM_INVERSE][0][0]src[0] + g_aiT4[TRANSFORM_INVERSE][2][0]src[2*line];
	488	E[1] = g_aiT4[TRANSFORM_INVERSE][0][1]src[0] + g_aiT4[TRANSFORM_INVERSE][2][1]src[2*line];
[56]	489
	490	/* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
[1313]	491	dst[0] = Clip3( outputMinimum, outputMaximum, (E[0] + O[0] + add)>>shift );
	492	dst[1] = Clip3( outputMinimum, outputMaximum, (E[1] + O[1] + add)>>shift );
	493	dst[2] = Clip3( outputMinimum, outputMaximum, (E[1] - O[1] + add)>>shift );
	494	dst[3] = Clip3( outputMinimum, outputMaximum, (E[0] - O[0] + add)>>shift );
	495
[56]	496	src ++;
	497	dst += 4;
	498	}
	499	}
	500
[1313]	501	/** 8x8 forward transform implemented using partial butterfly structure (1D)
	502	* \param src input data (residual)
	503	* \param dst output data (transform coefficients)
	504	* \param shift specifies right shift after 1D transform
	505	* \param line
	506	*/
	507	Void partialButterfly8(TCoeff src, TCoeff dst, Int shift, Int line)
[56]	508	{
[608]	509	Int j,k;
[1313]	510	TCoeff E[4],O[4];
	511	TCoeff EE[2],EO[2];
	512	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
[56]	513
	514	for (j=0; j<line; j++)
[1313]	515	{
[56]	516	/* E and O*/
	517	for (k=0;k<4;k++)
	518	{
	519	E[k] = src[k] + src[7-k];
	520	O[k] = src[k] - src[7-k];
[1313]	521	}
[56]	522	/* EE and EO */
[1313]	523	EE[0] = E[0] + E[3];
[56]	524	EO[0] = E[0] - E[3];
	525	EE[1] = E[1] + E[2];
	526	EO[1] = E[1] - E[2];
	527
[1313]	528	dst[0] = (g_aiT8[TRANSFORM_FORWARD][0][0]EE[0] + g_aiT8[TRANSFORM_FORWARD][0][1]EE[1] + add)>>shift;
	529	dst[4line] = (g_aiT8[TRANSFORM_FORWARD][4][0]EE[0] + g_aiT8[TRANSFORM_FORWARD][4][1]*EE[1] + add)>>shift;
	530	dst[2line] = (g_aiT8[TRANSFORM_FORWARD][2][0]EO[0] + g_aiT8[TRANSFORM_FORWARD][2][1]*EO[1] + add)>>shift;
	531	dst[6line] = (g_aiT8[TRANSFORM_FORWARD][6][0]EO[0] + g_aiT8[TRANSFORM_FORWARD][6][1]*EO[1] + add)>>shift;
[56]	532
[1313]	533	dst[line] = (g_aiT8[TRANSFORM_FORWARD][1][0]O[0] + g_aiT8[TRANSFORM_FORWARD][1][1]O[1] + g_aiT8[TRANSFORM_FORWARD][1][2]O[2] + g_aiT8[TRANSFORM_FORWARD][1][3]O[3] + add)>>shift;
	534	dst[3line] = (g_aiT8[TRANSFORM_FORWARD][3][0]O[0] + g_aiT8[TRANSFORM_FORWARD][3][1]O[1] + g_aiT8[TRANSFORM_FORWARD][3][2]O[2] + g_aiT8[TRANSFORM_FORWARD][3][3]*O[3] + add)>>shift;
	535	dst[5line] = (g_aiT8[TRANSFORM_FORWARD][5][0]O[0] + g_aiT8[TRANSFORM_FORWARD][5][1]O[1] + g_aiT8[TRANSFORM_FORWARD][5][2]O[2] + g_aiT8[TRANSFORM_FORWARD][5][3]*O[3] + add)>>shift;
	536	dst[7line] = (g_aiT8[TRANSFORM_FORWARD][7][0]O[0] + g_aiT8[TRANSFORM_FORWARD][7][1]O[1] + g_aiT8[TRANSFORM_FORWARD][7][2]O[2] + g_aiT8[TRANSFORM_FORWARD][7][3]*O[3] + add)>>shift;
[56]	537
	538	src += 8;
	539	dst ++;
	540	}
	541	}
	542
[1313]	543	/** 8x8 inverse transform implemented using partial butterfly structure (1D)
	544	* \param src input data (transform coefficients)
	545	* \param dst output data (residual)
	546	* \param shift specifies right shift after 1D transform
	547	* \param line
	548	* \param outputMinimum minimum for clipping
	549	* \param outputMaximum maximum for clipping
	550	*/
	551	Void partialButterflyInverse8(TCoeff src, TCoeff dst, Int shift, Int line, const TCoeff outputMinimum, const TCoeff outputMaximum)
[56]	552	{
[608]	553	Int j,k;
[1313]	554	TCoeff E[4],O[4];
	555	TCoeff EE[2],EO[2];
	556	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
[56]	557
[1313]	558	for (j=0; j<line; j++)
	559	{
[56]	560	/* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
	561	for (k=0;k<4;k++)
	562	{
[1313]	563	O[k] = g_aiT8[TRANSFORM_INVERSE][ 1][k]src[line] + g_aiT8[TRANSFORM_INVERSE][ 3][k]src[3*line] +
	564	g_aiT8[TRANSFORM_INVERSE][ 5][k]src[5line] + g_aiT8[TRANSFORM_INVERSE][ 7][k]src[7line];
[56]	565	}
	566
[1313]	567	EO[0] = g_aiT8[TRANSFORM_INVERSE][2][0]src[ 2line ] + g_aiT8[TRANSFORM_INVERSE][6][0]src[ 6line ];
	568	EO[1] = g_aiT8[TRANSFORM_INVERSE][2][1]src[ 2line ] + g_aiT8[TRANSFORM_INVERSE][6][1]src[ 6line ];
	569	EE[0] = g_aiT8[TRANSFORM_INVERSE][0][0]src[ 0 ] + g_aiT8[TRANSFORM_INVERSE][4][0]src[ 4*line ];
	570	EE[1] = g_aiT8[TRANSFORM_INVERSE][0][1]src[ 0 ] + g_aiT8[TRANSFORM_INVERSE][4][1]src[ 4*line ];
[56]	571
[1313]	572	/* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
[56]	573	E[0] = EE[0] + EO[0];
	574	E[3] = EE[0] - EO[0];
	575	E[1] = EE[1] + EO[1];
	576	E[2] = EE[1] - EO[1];
	577	for (k=0;k<4;k++)
	578	{
[1313]	579	dst[ k ] = Clip3( outputMinimum, outputMaximum, (E[k] + O[k] + add)>>shift );
	580	dst[ k+4 ] = Clip3( outputMinimum, outputMaximum, (E[3-k] - O[3-k] + add)>>shift );
	581	}
[56]	582	src ++;
	583	dst += 8;
	584	}
	585	}
	586
[1313]	587	/** 16x16 forward transform implemented using partial butterfly structure (1D)
	588	* \param src input data (residual)
	589	* \param dst output data (transform coefficients)
	590	* \param shift specifies right shift after 1D transform
	591	* \param line
	592	*/
	593	Void partialButterfly16(TCoeff src, TCoeff dst, Int shift, Int line)
[56]	594	{
[608]	595	Int j,k;
[1313]	596	TCoeff E[8],O[8];
	597	TCoeff EE[4],EO[4];
	598	TCoeff EEE[2],EEO[2];
	599	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
[56]	600
[1313]	601	for (j=0; j<line; j++)
	602	{
[56]	603	/* E and O*/
	604	for (k=0;k<8;k++)
	605	{
	606	E[k] = src[k] + src[15-k];
	607	O[k] = src[k] - src[15-k];
[1313]	608	}
[56]	609	/* EE and EO */
	610	for (k=0;k<4;k++)
	611	{
	612	EE[k] = E[k] + E[7-k];
	613	EO[k] = E[k] - E[7-k];
	614	}
	615	/* EEE and EEO */
[1313]	616	EEE[0] = EE[0] + EE[3];
[56]	617	EEO[0] = EE[0] - EE[3];
	618	EEE[1] = EE[1] + EE[2];
	619	EEO[1] = EE[1] - EE[2];
	620
[1313]	621	dst[ 0 ] = (g_aiT16[TRANSFORM_FORWARD][ 0][0]EEE[0] + g_aiT16[TRANSFORM_FORWARD][ 0][1]EEE[1] + add)>>shift;
	622	dst[ 8line ] = (g_aiT16[TRANSFORM_FORWARD][ 8][0]EEE[0] + g_aiT16[TRANSFORM_FORWARD][ 8][1]*EEE[1] + add)>>shift;
	623	dst[ 4line ] = (g_aiT16[TRANSFORM_FORWARD][ 4][0]EEO[0] + g_aiT16[TRANSFORM_FORWARD][ 4][1]*EEO[1] + add)>>shift;
	624	dst[ 12line] = (g_aiT16[TRANSFORM_FORWARD][12][0]EEO[0] + g_aiT16[TRANSFORM_FORWARD][12][1]*EEO[1] + add)>>shift;
[56]	625
	626	for (k=2;k<16;k+=4)
	627	{
[1313]	628	dst[ kline ] = (g_aiT16[TRANSFORM_FORWARD][k][0]EO[0] + g_aiT16[TRANSFORM_FORWARD][k][1]*EO[1] +
	629	g_aiT16[TRANSFORM_FORWARD][k][2]EO[2] + g_aiT16[TRANSFORM_FORWARD][k][3]EO[3] + add)>>shift;
[56]	630	}
	631
	632	for (k=1;k<16;k+=2)
	633	{
[1313]	634	dst[ kline ] = (g_aiT16[TRANSFORM_FORWARD][k][0]O[0] + g_aiT16[TRANSFORM_FORWARD][k][1]*O[1] +
	635	g_aiT16[TRANSFORM_FORWARD][k][2]O[2] + g_aiT16[TRANSFORM_FORWARD][k][3]O[3] +
	636	g_aiT16[TRANSFORM_FORWARD][k][4]O[4] + g_aiT16[TRANSFORM_FORWARD][k][5]O[5] +
	637	g_aiT16[TRANSFORM_FORWARD][k][6]O[6] + g_aiT16[TRANSFORM_FORWARD][k][7]O[7] + add)>>shift;
[56]	638	}
	639
	640	src += 16;
[1313]	641	dst ++;
[56]	642
	643	}
	644	}
	645
[1313]	646	/** 16x16 inverse transform implemented using partial butterfly structure (1D)
	647	* \param src input data (transform coefficients)
	648	* \param dst output data (residual)
	649	* \param shift specifies right shift after 1D transform
	650	* \param line
	651	* \param outputMinimum minimum for clipping
	652	* \param outputMaximum maximum for clipping
	653	*/
	654	Void partialButterflyInverse16(TCoeff src, TCoeff dst, Int shift, Int line, const TCoeff outputMinimum, const TCoeff outputMaximum)
[56]	655	{
[608]	656	Int j,k;
[1313]	657	TCoeff E[8],O[8];
	658	TCoeff EE[4],EO[4];
	659	TCoeff EEE[2],EEO[2];
	660	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
[56]	661
	662	for (j=0; j<line; j++)
[1313]	663	{
[56]	664	/* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
	665	for (k=0;k<8;k++)
	666	{
[1313]	667	O[k] = g_aiT16[TRANSFORM_INVERSE][ 1][k]src[ line] + g_aiT16[TRANSFORM_INVERSE][ 3][k]src[ 3*line] +
	668	g_aiT16[TRANSFORM_INVERSE][ 5][k]src[ 5line] + g_aiT16[TRANSFORM_INVERSE][ 7][k]src[ 7line] +
	669	g_aiT16[TRANSFORM_INVERSE][ 9][k]src[ 9line] + g_aiT16[TRANSFORM_INVERSE][11][k]src[11line] +
	670	g_aiT16[TRANSFORM_INVERSE][13][k]src[13line] + g_aiT16[TRANSFORM_INVERSE][15][k]src[15line];
[56]	671	}
	672	for (k=0;k<4;k++)
	673	{
[1313]	674	EO[k] = g_aiT16[TRANSFORM_INVERSE][ 2][k]src[ 2line] + g_aiT16[TRANSFORM_INVERSE][ 6][k]src[ 6line] +
	675	g_aiT16[TRANSFORM_INVERSE][10][k]src[10line] + g_aiT16[TRANSFORM_INVERSE][14][k]src[14line];
[56]	676	}
[1313]	677	EEO[0] = g_aiT16[TRANSFORM_INVERSE][4][0]src[ 4line ] + g_aiT16[TRANSFORM_INVERSE][12][0]src[ 12line ];
	678	EEE[0] = g_aiT16[TRANSFORM_INVERSE][0][0]src[ 0 ] + g_aiT16[TRANSFORM_INVERSE][ 8][0]src[ 8*line ];
	679	EEO[1] = g_aiT16[TRANSFORM_INVERSE][4][1]src[ 4line ] + g_aiT16[TRANSFORM_INVERSE][12][1]src[ 12line ];
	680	EEE[1] = g_aiT16[TRANSFORM_INVERSE][0][1]src[ 0 ] + g_aiT16[TRANSFORM_INVERSE][ 8][1]src[ 8*line ];
[56]	681
[1313]	682	/* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
[56]	683	for (k=0;k<2;k++)
	684	{
	685	EE[k] = EEE[k] + EEO[k];
	686	EE[k+2] = EEE[1-k] - EEO[1-k];
[1313]	687	}
[56]	688	for (k=0;k<4;k++)
	689	{
	690	E[k] = EE[k] + EO[k];
	691	E[k+4] = EE[3-k] - EO[3-k];
[1313]	692	}
[56]	693	for (k=0;k<8;k++)
	694	{
[1313]	695	dst[k] = Clip3( outputMinimum, outputMaximum, (E[k] + O[k] + add)>>shift );
	696	dst[k+8] = Clip3( outputMinimum, outputMaximum, (E[7-k] - O[7-k] + add)>>shift );
	697	}
	698	src ++;
[56]	699	dst += 16;
	700	}
	701	}
	702
[1313]	703	/** 32x32 forward transform implemented using partial butterfly structure (1D)
	704	* \param src input data (residual)
	705	* \param dst output data (transform coefficients)
	706	* \param shift specifies right shift after 1D transform
	707	* \param line
	708	*/
	709	Void partialButterfly32(TCoeff src, TCoeff dst, Int shift, Int line)
[56]	710	{
[608]	711	Int j,k;
[1313]	712	TCoeff E[16],O[16];
	713	TCoeff EE[8],EO[8];
	714	TCoeff EEE[4],EEO[4];
	715	TCoeff EEEE[2],EEEO[2];
	716	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
[56]	717
	718	for (j=0; j<line; j++)
[1313]	719	{
[56]	720	/* E and O*/
	721	for (k=0;k<16;k++)
	722	{
	723	E[k] = src[k] + src[31-k];
	724	O[k] = src[k] - src[31-k];
[1313]	725	}
[56]	726	/* EE and EO */
	727	for (k=0;k<8;k++)
	728	{
	729	EE[k] = E[k] + E[15-k];
	730	EO[k] = E[k] - E[15-k];
	731	}
	732	/* EEE and EEO */
	733	for (k=0;k<4;k++)
	734	{
	735	EEE[k] = EE[k] + EE[7-k];
	736	EEO[k] = EE[k] - EE[7-k];
	737	}
	738	/* EEEE and EEEO */
[1313]	739	EEEE[0] = EEE[0] + EEE[3];
[56]	740	EEEO[0] = EEE[0] - EEE[3];
	741	EEEE[1] = EEE[1] + EEE[2];
	742	EEEO[1] = EEE[1] - EEE[2];
	743
[1313]	744	dst[ 0 ] = (g_aiT32[TRANSFORM_FORWARD][ 0][0]EEEE[0] + g_aiT32[TRANSFORM_FORWARD][ 0][1]EEEE[1] + add)>>shift;
	745	dst[ 16line ] = (g_aiT32[TRANSFORM_FORWARD][16][0]EEEE[0] + g_aiT32[TRANSFORM_FORWARD][16][1]*EEEE[1] + add)>>shift;
	746	dst[ 8line ] = (g_aiT32[TRANSFORM_FORWARD][ 8][0]EEEO[0] + g_aiT32[TRANSFORM_FORWARD][ 8][1]*EEEO[1] + add)>>shift;
	747	dst[ 24line ] = (g_aiT32[TRANSFORM_FORWARD][24][0]EEEO[0] + g_aiT32[TRANSFORM_FORWARD][24][1]*EEEO[1] + add)>>shift;
[56]	748	for (k=4;k<32;k+=8)
	749	{
[1313]	750	dst[ kline ] = (g_aiT32[TRANSFORM_FORWARD][k][0]EEO[0] + g_aiT32[TRANSFORM_FORWARD][k][1]*EEO[1] +
	751	g_aiT32[TRANSFORM_FORWARD][k][2]EEO[2] + g_aiT32[TRANSFORM_FORWARD][k][3]EEO[3] + add)>>shift;
	752	}
[56]	753	for (k=2;k<32;k+=4)
	754	{
[1313]	755	dst[ kline ] = (g_aiT32[TRANSFORM_FORWARD][k][0]EO[0] + g_aiT32[TRANSFORM_FORWARD][k][1]*EO[1] +
	756	g_aiT32[TRANSFORM_FORWARD][k][2]EO[2] + g_aiT32[TRANSFORM_FORWARD][k][3]EO[3] +
	757	g_aiT32[TRANSFORM_FORWARD][k][4]EO[4] + g_aiT32[TRANSFORM_FORWARD][k][5]EO[5] +
	758	g_aiT32[TRANSFORM_FORWARD][k][6]EO[6] + g_aiT32[TRANSFORM_FORWARD][k][7]EO[7] + add)>>shift;
	759	}
[56]	760	for (k=1;k<32;k+=2)
	761	{
[1313]	762	dst[ kline ] = (g_aiT32[TRANSFORM_FORWARD][k][ 0]O[ 0] + g_aiT32[TRANSFORM_FORWARD][k][ 1]*O[ 1] +
	763	g_aiT32[TRANSFORM_FORWARD][k][ 2]O[ 2] + g_aiT32[TRANSFORM_FORWARD][k][ 3]O[ 3] +
	764	g_aiT32[TRANSFORM_FORWARD][k][ 4]O[ 4] + g_aiT32[TRANSFORM_FORWARD][k][ 5]O[ 5] +
	765	g_aiT32[TRANSFORM_FORWARD][k][ 6]O[ 6] + g_aiT32[TRANSFORM_FORWARD][k][ 7]O[ 7] +
	766	g_aiT32[TRANSFORM_FORWARD][k][ 8]O[ 8] + g_aiT32[TRANSFORM_FORWARD][k][ 9]O[ 9] +
	767	g_aiT32[TRANSFORM_FORWARD][k][10]O[10] + g_aiT32[TRANSFORM_FORWARD][k][11]O[11] +
	768	g_aiT32[TRANSFORM_FORWARD][k][12]O[12] + g_aiT32[TRANSFORM_FORWARD][k][13]O[13] +
	769	g_aiT32[TRANSFORM_FORWARD][k][14]O[14] + g_aiT32[TRANSFORM_FORWARD][k][15]O[15] + add)>>shift;
[56]	770	}
[1313]	771
[56]	772	src += 32;
	773	dst ++;
	774	}
	775	}
	776
[1313]	777	/** 32x32 inverse transform implemented using partial butterfly structure (1D)
	778	* \param src input data (transform coefficients)
	779	* \param dst output data (residual)
	780	* \param shift specifies right shift after 1D transform
	781	* \param line
	782	* \param outputMinimum minimum for clipping
	783	* \param outputMaximum maximum for clipping
	784	*/
	785	Void partialButterflyInverse32(TCoeff src, TCoeff dst, Int shift, Int line, const TCoeff outputMinimum, const TCoeff outputMaximum)
[56]	786	{
[608]	787	Int j,k;
[1313]	788	TCoeff E[16],O[16];
	789	TCoeff EE[8],EO[8];
	790	TCoeff EEE[4],EEO[4];
	791	TCoeff EEEE[2],EEEO[2];
	792	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
[56]	793
	794	for (j=0; j<line; j++)
[1313]	795	{
[56]	796	/* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
	797	for (k=0;k<16;k++)
	798	{
[1313]	799	O[k] = g_aiT32[TRANSFORM_INVERSE][ 1][k]src[ line ] + g_aiT32[TRANSFORM_INVERSE][ 3][k]src[ 3*line ] +
	800	g_aiT32[TRANSFORM_INVERSE][ 5][k]src[ 5line ] + g_aiT32[TRANSFORM_INVERSE][ 7][k]src[ 7line ] +
	801	g_aiT32[TRANSFORM_INVERSE][ 9][k]src[ 9line ] + g_aiT32[TRANSFORM_INVERSE][11][k]src[ 11line ] +
	802	g_aiT32[TRANSFORM_INVERSE][13][k]src[ 13line ] + g_aiT32[TRANSFORM_INVERSE][15][k]src[ 15line ] +
	803	g_aiT32[TRANSFORM_INVERSE][17][k]src[ 17line ] + g_aiT32[TRANSFORM_INVERSE][19][k]src[ 19line ] +
	804	g_aiT32[TRANSFORM_INVERSE][21][k]src[ 21line ] + g_aiT32[TRANSFORM_INVERSE][23][k]src[ 23line ] +
	805	g_aiT32[TRANSFORM_INVERSE][25][k]src[ 25line ] + g_aiT32[TRANSFORM_INVERSE][27][k]src[ 27line ] +
	806	g_aiT32[TRANSFORM_INVERSE][29][k]src[ 29line ] + g_aiT32[TRANSFORM_INVERSE][31][k]src[ 31line ];
[56]	807	}
	808	for (k=0;k<8;k++)
	809	{
[1313]	810	EO[k] = g_aiT32[TRANSFORM_INVERSE][ 2][k]src[ 2line ] + g_aiT32[TRANSFORM_INVERSE][ 6][k]src[ 6line ] +
	811	g_aiT32[TRANSFORM_INVERSE][10][k]src[ 10line ] + g_aiT32[TRANSFORM_INVERSE][14][k]src[ 14line ] +
	812	g_aiT32[TRANSFORM_INVERSE][18][k]src[ 18line ] + g_aiT32[TRANSFORM_INVERSE][22][k]src[ 22line ] +
	813	g_aiT32[TRANSFORM_INVERSE][26][k]src[ 26line ] + g_aiT32[TRANSFORM_INVERSE][30][k]src[ 30line ];
[56]	814	}
	815	for (k=0;k<4;k++)
	816	{
[1313]	817	EEO[k] = g_aiT32[TRANSFORM_INVERSE][ 4][k]src[ 4line ] + g_aiT32[TRANSFORM_INVERSE][12][k]src[ 12line ] +
	818	g_aiT32[TRANSFORM_INVERSE][20][k]src[ 20line ] + g_aiT32[TRANSFORM_INVERSE][28][k]src[ 28line ];
[56]	819	}
[1313]	820	EEEO[0] = g_aiT32[TRANSFORM_INVERSE][8][0]src[ 8line ] + g_aiT32[TRANSFORM_INVERSE][24][0]src[ 24line ];
	821	EEEO[1] = g_aiT32[TRANSFORM_INVERSE][8][1]src[ 8line ] + g_aiT32[TRANSFORM_INVERSE][24][1]src[ 24line ];
	822	EEEE[0] = g_aiT32[TRANSFORM_INVERSE][0][0]src[ 0 ] + g_aiT32[TRANSFORM_INVERSE][16][0]src[ 16*line ];
	823	EEEE[1] = g_aiT32[TRANSFORM_INVERSE][0][1]src[ 0 ] + g_aiT32[TRANSFORM_INVERSE][16][1]src[ 16*line ];
[56]	824
	825	/* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
	826	EEE[0] = EEEE[0] + EEEO[0];
	827	EEE[3] = EEEE[0] - EEEO[0];
	828	EEE[1] = EEEE[1] + EEEO[1];
[1313]	829	EEE[2] = EEEE[1] - EEEO[1];
[56]	830	for (k=0;k<4;k++)
	831	{
	832	EE[k] = EEE[k] + EEO[k];
	833	EE[k+4] = EEE[3-k] - EEO[3-k];
[1313]	834	}
[56]	835	for (k=0;k<8;k++)
	836	{
	837	E[k] = EE[k] + EO[k];
	838	E[k+8] = EE[7-k] - EO[7-k];
[1313]	839	}
[56]	840	for (k=0;k<16;k++)
	841	{
[1313]	842	dst[k] = Clip3( outputMinimum, outputMaximum, (E[k] + O[k] + add)>>shift );
	843	dst[k+16] = Clip3( outputMinimum, outputMaximum, (E[15-k] - O[15-k] + add)>>shift );
[56]	844	}
	845	src ++;
	846	dst += 32;
	847	}
	848	}
	849
	850	/** MxN forward transform (2D)
[1313]	851	* \param bitDepth [in] bit depth
	852	* \param block [in] residual block
	853	* \param coeff [out] transform coefficients
	854	* \param iWidth [in] width of transform
	855	* \param iHeight [in] height of transform
	856	* \param useDST [in]
	857	* \param maxLog2TrDynamicRange [in]
	858
[56]	859	*/
[1313]	860	Void xTrMxN(Int bitDepth, TCoeff block, TCoeff coeff, Int iWidth, Int iHeight, Bool useDST, const Int maxLog2TrDynamicRange)
[2]	861	{
[1313]	862	const Int TRANSFORM_MATRIX_SHIFT = g_transformMatrixShift[TRANSFORM_FORWARD];
[2]	863
[1313]	864	const Int shift_1st = ((g_aucConvertToBit[iWidth] + 2) + bitDepth + TRANSFORM_MATRIX_SHIFT) - maxLog2TrDynamicRange;
	865	const Int shift_2nd = (g_aucConvertToBit[iHeight] + 2) + TRANSFORM_MATRIX_SHIFT;
[2]	866
[1313]	867	assert(shift_1st >= 0);
	868	assert(shift_2nd >= 0);
	869
	870	TCoeff tmp[ MAX_TU_SIZE * MAX_TU_SIZE ];
	871
	872	switch (iWidth)
[2]	873	{
[1313]	874	case 4:
	875	{
	876	if ((iHeight == 4) && useDST) // Check for DCT or DST
	877	{
	878	fastForwardDst( block, tmp, shift_1st );
	879	}
	880	else
	881	{
	882	partialButterfly4 ( block, tmp, shift_1st, iHeight );
	883	}
	884	}
	885	break;
[608]	886
[1313]	887	case 8: partialButterfly8 ( block, tmp, shift_1st, iHeight ); break;
	888	case 16: partialButterfly16( block, tmp, shift_1st, iHeight ); break;
	889	case 32: partialButterfly32( block, tmp, shift_1st, iHeight ); break;
	890	default:
	891	assert(0); exit (1); break;
[2]	892	}
[1313]	893
	894	switch (iHeight)
[2]	895	{
[1313]	896	case 4:
	897	{
	898	if ((iWidth == 4) && useDST) // Check for DCT or DST
	899	{
	900	fastForwardDst( tmp, coeff, shift_2nd );
	901	}
	902	else
	903	{
	904	partialButterfly4 ( tmp, coeff, shift_2nd, iWidth );
	905	}
	906	}
	907	break;
	908
	909	case 8: partialButterfly8 ( tmp, coeff, shift_2nd, iWidth ); break;
	910	case 16: partialButterfly16( tmp, coeff, shift_2nd, iWidth ); break;
	911	case 32: partialButterfly32( tmp, coeff, shift_2nd, iWidth ); break;
	912	default:
	913	assert(0); exit (1); break;
[2]	914	}
[56]	915	}
[1313]	916
	917
[56]	918	/** MxN inverse transform (2D)
[1313]	919	* \param bitDepth [in] bit depth
	920	* \param coeff [in] transform coefficients
	921	* \param block [out] residual block
	922	* \param iWidth [in] width of transform
	923	* \param iHeight [in] height of transform
	924	* \param useDST [in]
	925	* \param maxLog2TrDynamicRange [in]
[56]	926	*/
[1313]	927	Void xITrMxN(Int bitDepth, TCoeff coeff, TCoeff block, Int iWidth, Int iHeight, Bool useDST, const Int maxLog2TrDynamicRange)
[56]	928	{
[1313]	929	const Int TRANSFORM_MATRIX_SHIFT = g_transformMatrixShift[TRANSFORM_INVERSE];
[2]	930
[1313]	931	Int shift_1st = TRANSFORM_MATRIX_SHIFT + 1; //1 has been added to shift_1st at the expense of shift_2nd
	932	Int shift_2nd = (TRANSFORM_MATRIX_SHIFT + maxLog2TrDynamicRange - 1) - bitDepth;
	933	const TCoeff clipMinimum = -(1 << maxLog2TrDynamicRange);
	934	const TCoeff clipMaximum = (1 << maxLog2TrDynamicRange) - 1;
	935
	936	assert(shift_1st >= 0);
	937	assert(shift_2nd >= 0);
	938
	939	TCoeff tmp[MAX_TU_SIZE * MAX_TU_SIZE];
	940
	941	switch (iHeight)
[56]	942	{
[1313]	943	case 4:
	944	{
	945	if ((iWidth == 4) && useDST) // Check for DCT or DST
	946	{
	947	fastInverseDst( coeff, tmp, shift_1st, clipMinimum, clipMaximum);
	948	}
	949	else
	950	{
	951	partialButterflyInverse4 ( coeff, tmp, shift_1st, iWidth, clipMinimum, clipMaximum);
	952	}
	953	}
	954	break;
	955
	956	case 8: partialButterflyInverse8 ( coeff, tmp, shift_1st, iWidth, clipMinimum, clipMaximum); break;
	957	case 16: partialButterflyInverse16( coeff, tmp, shift_1st, iWidth, clipMinimum, clipMaximum); break;
	958	case 32: partialButterflyInverse32( coeff, tmp, shift_1st, iWidth, clipMinimum, clipMaximum); break;
	959
	960	default:
	961	assert(0); exit (1); break;
[2]	962	}
[1313]	963
	964	switch (iWidth)
[2]	965	{
[1313]	966	// Clipping here is not in the standard, but is used to protect the "Pel" data type into which the inverse-transformed samples will be copied
	967	case 4:
	968	{
	969	if ((iHeight == 4) && useDST) // Check for DCT or DST
	970	{
	971	fastInverseDst( tmp, block, shift_2nd, std::numeric_limits<Pel>::min(), std::numeric_limits<Pel>::max() );
	972	}
	973	else
	974	{
	975	partialButterflyInverse4 ( tmp, block, shift_2nd, iHeight, std::numeric_limits<Pel>::min(), std::numeric_limits<Pel>::max());
	976	}
	977	}
	978	break;
	979
	980	case 8: partialButterflyInverse8 ( tmp, block, shift_2nd, iHeight, std::numeric_limits<Pel>::min(), std::numeric_limits<Pel>::max()); break;
	981	case 16: partialButterflyInverse16( tmp, block, shift_2nd, iHeight, std::numeric_limits<Pel>::min(), std::numeric_limits<Pel>::max()); break;
	982	case 32: partialButterflyInverse32( tmp, block, shift_2nd, iHeight, std::numeric_limits<Pel>::min(), std::numeric_limits<Pel>::max()); break;
	983
	984	default:
	985	assert(0); exit (1); break;
[2]	986	}
[56]	987	}
[2]	988
	989
[1313]	990	// To minimize the distortion only. No rate is considered.
	991	Void TComTrQuant::signBitHidingHDQ( TCoeff* pQCoef, TCoeff* pCoef, TCoeff* deltaU, const TUEntropyCodingParameters &codingParameters, const Int maxLog2TrDynamicRange )
[56]	992	{
[1313]	993	const UInt width = codingParameters.widthInGroups << MLS_CG_LOG2_WIDTH;
	994	const UInt height = codingParameters.heightInGroups << MLS_CG_LOG2_HEIGHT;
	995	const UInt groupSize = 1 << MLS_CG_SIZE;
	996
	997	const TCoeff entropyCodingMinimum = -(1 << maxLog2TrDynamicRange);
	998	const TCoeff entropyCodingMaximum = (1 << maxLog2TrDynamicRange) - 1;
	999
[56]	1000	Int lastCG = -1;
	1001	Int absSum = 0 ;
	1002	Int n ;
[2]	1003
[1313]	1004	for( Int subSet = (width*height-1) >> MLS_CG_SIZE; subSet >= 0; subSet-- )
[56]	1005	{
[1313]	1006	Int subPos = subSet << MLS_CG_SIZE;
	1007	Int firstNZPosInCG=groupSize , lastNZPosInCG=-1 ;
[56]	1008	absSum = 0 ;
[2]	1009
[1313]	1010	for(n = groupSize-1; n >= 0; --n )
[56]	1011	{
[1313]	1012	if( pQCoef[ codingParameters.scan[ n + subPos ]] )
[56]	1013	{
	1014	lastNZPosInCG = n;
	1015	break;
	1016	}
	1017	}
[2]	1018
[1313]	1019	for(n = 0; n <groupSize; n++ )
[56]	1020	{
[1313]	1021	if( pQCoef[ codingParameters.scan[ n + subPos ]] )
[56]	1022	{
	1023	firstNZPosInCG = n;
	1024	break;
	1025	}
	1026	}
[2]	1027
[56]	1028	for(n = firstNZPosInCG; n <=lastNZPosInCG; n++ )
	1029	{
[1313]	1030	absSum += Int(pQCoef[ codingParameters.scan[ n + subPos ]]);
[56]	1031	}
[2]	1032
[1313]	1033	if(lastNZPosInCG>=0 && lastCG==-1)
[56]	1034	{
[1313]	1035	lastCG = 1 ;
[56]	1036	}
[608]	1037
	1038	if( lastNZPosInCG-firstNZPosInCG>=SBH_THRESHOLD )
[56]	1039	{
[1313]	1040	UInt signbit = (pQCoef[codingParameters.scan[subPos+firstNZPosInCG]]>0?0:1) ;
[56]	1041	if( signbit!=(absSum&0x1) ) //compare signbit with sum_parity
	1042	{
[1313]	1043	TCoeff curCost = std::numeric_limits<TCoeff>::max();
	1044	TCoeff minCostInc = std::numeric_limits<TCoeff>::max();
	1045	Int minPos =-1, finalChange=0, curChange=0;
	1046
	1047	for( n = (lastCG==1?lastNZPosInCG:groupSize-1) ; n >= 0; --n )
[2]	1048	{
[1313]	1049	UInt blkPos = codingParameters.scan[ n+subPos ];
[56]	1050	if(pQCoef[ blkPos ] != 0 )
[2]	1051	{
[56]	1052	if(deltaU[blkPos]>0)
[2]	1053	{
[1313]	1054	curCost = - deltaU[blkPos];
[56]	1055	curChange=1 ;
[2]	1056	}
[1313]	1057	else
[2]	1058	{
[56]	1059	//curChange =-1;
	1060	if(n==firstNZPosInCG && abs(pQCoef[blkPos])==1)
	1061	{
[1313]	1062	curCost = std::numeric_limits<TCoeff>::max();
[56]	1063	}
	1064	else
	1065	{
[1313]	1066	curCost = deltaU[blkPos];
[56]	1067	curChange =-1;
	1068	}
[2]	1069	}
	1070	}
	1071	else
	1072	{
[56]	1073	if(n<firstNZPosInCG)
	1074	{
	1075	UInt thisSignBit = (pCoef[blkPos]>=0?0:1);
	1076	if(thisSignBit != signbit )
	1077	{
[1313]	1078	curCost = std::numeric_limits<TCoeff>::max();
[2]	1079	}
[56]	1080	else
[1313]	1081	{
[56]	1082	curCost = - (deltaU[blkPos]) ;
	1083	curChange = 1 ;
[2]	1084	}
	1085	}
	1086	else
	1087	{
[56]	1088	curCost = - (deltaU[blkPos]) ;
	1089	curChange = 1 ;
[2]	1090	}
	1091	}
[56]	1092
	1093	if( curCost<minCostInc)
[2]	1094	{
[56]	1095	minCostInc = curCost ;
	1096	finalChange = curChange ;
	1097	minPos = blkPos ;
[2]	1098	}
[56]	1099	} //CG loop
[2]	1100
[1313]	1101	if(pQCoef[minPos] == entropyCodingMaximum \|\| pQCoef[minPos] == entropyCodingMinimum)
[56]	1102	{
	1103	finalChange = -1;
[2]	1104	}
	1105
[56]	1106	if(pCoef[minPos]>=0)
[2]	1107	{
[1313]	1108	pQCoef[minPos] += finalChange ;
[2]	1109	}
[1313]	1110	else
	1111	{
[56]	1112	pQCoef[minPos] -= finalChange ;
[1313]	1113	}
[56]	1114	} // Hide
	1115	}
[1313]	1116	if(lastCG==1)
[56]	1117	{
	1118	lastCG=0 ;
	1119	}
	1120	} // TU loop
	1121
	1122	return;
	1123	}
	1124
[1313]	1125
	1126	Void TComTrQuant::xQuant( TComTU &rTu,
	1127	TCoeff * pSrc,
	1128	TCoeff * pDes,
[56]	1129	#if ADAPTIVE_QP_SELECTION
[1313]	1130	TCoeff *pArlDes,
[56]	1131	#endif
[1313]	1132	TCoeff &uiAbsSum,
	1133	const ComponentID compID,
	1134	const QpParam &cQP )
[2]	1135	{
[1313]	1136	const TComRectangle &rect = rTu.getRect(compID);
	1137	const UInt uiWidth = rect.width;
	1138	const UInt uiHeight = rect.height;
	1139	TComDataCU* pcCU = rTu.getCU();
	1140	const UInt uiAbsPartIdx = rTu.GetAbsPartIdxTU();
	1141	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
	1142
	1143	TCoeff* piCoef = pSrc;
[56]	1144	TCoeff* piQCoef = pDes;
	1145	#if ADAPTIVE_QP_SELECTION
[1313]	1146	TCoeff* piArlCCoef = pArlDes;
[56]	1147	#endif
[1313]	1148
	1149	const Bool useTransformSkip = pcCU->getTransformSkip(uiAbsPartIdx, compID);
	1150	const Int maxLog2TrDynamicRange = pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID));
	1151
	1152	Bool useRDOQ = useTransformSkip ? m_useRDOQTS : m_useRDOQ;
	1153	if ( useRDOQ && (isLuma(compID) \|\| RDOQ_CHROMA) )
[2]	1154	{
[1313]	1155	if ( !m_useSelectiveRDOQ \|\| xNeedRDOQ( rTu, piCoef, compID, cQP ) )
	1156	{
[56]	1157	#if ADAPTIVE_QP_SELECTION
[1313]	1158	xRateDistOptQuant( rTu, piCoef, pDes, pArlDes, uiAbsSum, compID, cQP );
[2]	1159	#else
[1313]	1160	xRateDistOptQuant( rTu, piCoef, pDes, uiAbsSum, compID, cQP );
[2]	1161	#endif
[1313]	1162	}
	1163	else
	1164	{
	1165	memset( pDes, 0, sizeof( TCoeff ) * uiWidth *uiHeight );
	1166	uiAbsSum = 0;
	1167	}
[2]	1168	}
	1169	else
	1170	{
[1313]	1171	TUEntropyCodingParameters codingParameters;
	1172	getTUEntropyCodingParameters(codingParameters, rTu, compID);
[56]	1173
[1313]	1174	const TCoeff entropyCodingMinimum = -(1 << maxLog2TrDynamicRange);
	1175	const TCoeff entropyCodingMaximum = (1 << maxLog2TrDynamicRange) - 1;
[56]	1176
[1313]	1177	TCoeff deltaU[MAX_TU_SIZE * MAX_TU_SIZE];
[56]	1178
[1313]	1179	const UInt uiLog2TrSize = rTu.GetEquivalentLog2TrSize(compID);
[56]	1180
[1313]	1181	Int scalingListType = getScalingListType(pcCU->getPredictionMode(uiAbsPartIdx), compID);
	1182	assert(scalingListType < SCALING_LIST_NUM);
	1183	Int *piQuantCoeff = getQuantCoeff(scalingListType, cQP.rem, uiLog2TrSize-2);
	1184
	1185	const Bool enableScalingLists = getUseScalingList(uiWidth, uiHeight, (pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0));
	1186	const Int defaultQuantisationCoefficient = g_quantScales[cQP.rem];
	1187
	1188	/* for 422 chroma blocks, the effective scaling applied during transformation is not a power of 2, hence it cannot be
	1189	* implemented as a bit-shift (the quantised result will be sqrt(2) * larger than required). Alternatively, adjust the
	1190	* uiLog2TrSize applied in iTransformShift, such that the result is 1/sqrt(2) the required result (i.e. smaller)
	1191	* Then a QP+3 (sqrt(2)) or QP-3 (1/sqrt(2)) method could be used to get the required result
	1192	*/
	1193
	1194	// Represents scaling through forward transform
	1195	Int iTransformShift = getTransformShift(channelBitDepth, uiLog2TrSize, maxLog2TrDynamicRange);
	1196	if (useTransformSkip && pcCU->getSlice()->getSPS()->getSpsRangeExtension().getExtendedPrecisionProcessingFlag())
[2]	1197	{
[1313]	1198	iTransformShift = std::max<Int>(0, iTransformShift);
[2]	1199	}
[1313]	1200
	1201	const Int iQBits = QUANT_SHIFT + cQP.per + iTransformShift;
	1202	// QBits will be OK for any internal bit depth as the reduction in transform shift is balanced by an increase in Qp_per due to QpBDOffset
	1203
	1204	#if ADAPTIVE_QP_SELECTION
	1205	Int iQBitsC = MAX_INT;
	1206	Int iAddC = MAX_INT;
	1207
	1208	if (m_bUseAdaptQpSelect)
[56]	1209	{
[1313]	1210	iQBitsC = iQBits - ARL_C_PRECISION;
	1211	iAddC = 1 << (iQBitsC-1);
[2]	1212	}
	1213	#endif
	1214
[1313]	1215	const Int iAdd = (pcCU->getSlice()->getSliceType()==I_SLICE ? 171 : 85) << (iQBits-9);
	1216	const Int qBits8 = iQBits - 8;
[2]	1217
[1313]	1218	for( Int uiBlockPos = 0; uiBlockPos < uiWidth*uiHeight; uiBlockPos++ )
	1219	{
	1220	const TCoeff iLevel = piCoef[uiBlockPos];
	1221	const TCoeff iSign = (iLevel < 0 ? -1: 1);
[2]	1222
[1313]	1223	const Int64 tmpLevel = (Int64)abs(iLevel) * (enableScalingLists ? piQuantCoeff[uiBlockPos] : defaultQuantisationCoefficient);
[56]	1224
	1225	#if ADAPTIVE_QP_SELECTION
	1226	if( m_bUseAdaptQpSelect )
[2]	1227	{
[1313]	1228	piArlCCoef[uiBlockPos] = (TCoeff)((tmpLevel + iAddC ) >> iQBitsC);
[2]	1229	}
	1230	#endif
[1313]	1231
	1232	const TCoeff quantisedMagnitude = TCoeff((tmpLevel + iAdd ) >> iQBits);
	1233	deltaU[uiBlockPos] = (TCoeff)((tmpLevel - (quantisedMagnitude<<iQBits) )>> qBits8);
	1234
	1235	uiAbsSum += quantisedMagnitude;
	1236	const TCoeff quantisedCoefficient = quantisedMagnitude * iSign;
	1237
	1238	piQCoef[uiBlockPos] = Clip3<TCoeff>( entropyCodingMinimum, entropyCodingMaximum, quantisedCoefficient );
[2]	1239	} // for n
[1313]	1240
[1413]	1241	if( pcCU->getSlice()->getPPS()->getSignDataHidingEnabledFlag() )
[2]	1242	{
[1313]	1243	if(uiAbsSum >= 2) //this prevents TUs with only one coefficient of value 1 from being tested
[2]	1244	{
[1313]	1245	signBitHidingHDQ( piQCoef, piCoef, deltaU, codingParameters, maxLog2TrDynamicRange ) ;
[2]	1246	}
	1247	}
[56]	1248	} //if RDOQ
	1249	//return;
[2]	1250	}
	1251
[1313]	1252	Bool TComTrQuant::xNeedRDOQ( TComTU &rTu, TCoeff * pSrc, const ComponentID compID, const QpParam &cQP )
[2]	1253	{
[1313]	1254	const TComRectangle &rect = rTu.getRect(compID);
	1255	const UInt uiWidth = rect.width;
	1256	const UInt uiHeight = rect.height;
	1257	TComDataCU* pcCU = rTu.getCU();
	1258	const UInt uiAbsPartIdx = rTu.GetAbsPartIdxTU();
	1259	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
	1260
	1261	TCoeff* piCoef = pSrc;
	1262
	1263	const Bool useTransformSkip = pcCU->getTransformSkip(uiAbsPartIdx, compID);
	1264	const Int maxLog2TrDynamicRange = pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID));
	1265
	1266	const UInt uiLog2TrSize = rTu.GetEquivalentLog2TrSize(compID);
	1267
	1268	Int scalingListType = getScalingListType(pcCU->getPredictionMode(uiAbsPartIdx), compID);
	1269	assert(scalingListType < SCALING_LIST_NUM);
	1270	Int *piQuantCoeff = getQuantCoeff(scalingListType, cQP.rem, uiLog2TrSize-2);
	1271
	1272	const Bool enableScalingLists = getUseScalingList(uiWidth, uiHeight, (pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0));
	1273	const Int defaultQuantisationCoefficient = g_quantScales[cQP.rem];
	1274
	1275	/* for 422 chroma blocks, the effective scaling applied during transformation is not a power of 2, hence it cannot be
	1276	* implemented as a bit-shift (the quantised result will be sqrt(2) * larger than required). Alternatively, adjust the
	1277	* uiLog2TrSize applied in iTransformShift, such that the result is 1/sqrt(2) the required result (i.e. smaller)
	1278	* Then a QP+3 (sqrt(2)) or QP-3 (1/sqrt(2)) method could be used to get the required result
	1279	*/
	1280
	1281	// Represents scaling through forward transform
	1282	Int iTransformShift = getTransformShift(channelBitDepth, uiLog2TrSize, maxLog2TrDynamicRange);
	1283	if (useTransformSkip && pcCU->getSlice()->getSPS()->getSpsRangeExtension().getExtendedPrecisionProcessingFlag())
[2]	1284	{
[1313]	1285	iTransformShift = std::max<Int>(0, iTransformShift);
[2]	1286	}
	1287
[1313]	1288	const Int iQBits = QUANT_SHIFT + cQP.per + iTransformShift;
	1289	// QBits will be OK for any internal bit depth as the reduction in transform shift is balanced by an increase in Qp_per due to QpBDOffset
[608]	1290
[1313]	1291	// iAdd is different from the iAdd used in normal quantization
	1292	const Int iAdd = (compID == COMPONENT_Y ? 171 : 256) << (iQBits-9);
[2]	1293
[1313]	1294	for( Int uiBlockPos = 0; uiBlockPos < uiWidth*uiHeight; uiBlockPos++ )
	1295	{
	1296	const TCoeff iLevel = piCoef[uiBlockPos];
	1297	const Int64 tmpLevel = (Int64)abs(iLevel) * (enableScalingLists ? piQuantCoeff[uiBlockPos] : defaultQuantisationCoefficient);
	1298	const TCoeff quantisedMagnitude = TCoeff((tmpLevel + iAdd ) >> iQBits);
[56]	1299
[1313]	1300	if ( quantisedMagnitude != 0 )
	1301	{
	1302	return true;
	1303	}
	1304	} // for n
	1305	return false;
	1306	}
	1307
	1308	Void TComTrQuant::xDeQuant( TComTU &rTu,
	1309	const TCoeff * pSrc,
	1310	TCoeff * pDes,
	1311	const ComponentID compID,
	1312	const QpParam &cQP )
	1313	{
	1314	assert(compID<MAX_NUM_COMPONENT);
	1315
	1316	TComDataCU *pcCU = rTu.getCU();
	1317	const UInt uiAbsPartIdx = rTu.GetAbsPartIdxTU();
	1318	const TComRectangle &rect = rTu.getRect(compID);
	1319	const UInt uiWidth = rect.width;
	1320	const UInt uiHeight = rect.height;
	1321	const TCoeff *const piQCoef = pSrc;
	1322	TCoeff *const piCoef = pDes;
	1323	const UInt uiLog2TrSize = rTu.GetEquivalentLog2TrSize(compID);
	1324	const UInt numSamplesInBlock = uiWidth*uiHeight;
	1325	const Int maxLog2TrDynamicRange = pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID));
	1326	const TCoeff transformMinimum = -(1 << maxLog2TrDynamicRange);
	1327	const TCoeff transformMaximum = (1 << maxLog2TrDynamicRange) - 1;
	1328	const Bool enableScalingLists = getUseScalingList(uiWidth, uiHeight, (pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0));
	1329	const Int scalingListType = getScalingListType(pcCU->getPredictionMode(uiAbsPartIdx), compID);
	1330	#if O0043_BEST_EFFORT_DECODING
	1331	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getStreamBitDepth(toChannelType(compID));
	1332	#else
	1333	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
	1334	#endif
	1335
	1336	assert (scalingListType < SCALING_LIST_NUM);
	1337	assert ( uiWidth <= m_uiMaxTrSize );
	1338
	1339	// Represents scaling through forward transform
	1340	const Bool bClipTransformShiftTo0 = (pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0) && pcCU->getSlice()->getSPS()->getSpsRangeExtension().getExtendedPrecisionProcessingFlag();
	1341	const Int originalTransformShift = getTransformShift(channelBitDepth, uiLog2TrSize, maxLog2TrDynamicRange);
	1342	const Int iTransformShift = bClipTransformShiftTo0 ? std::max<Int>(0, originalTransformShift) : originalTransformShift;
	1343
	1344	const Int QP_per = cQP.per;
	1345	const Int QP_rem = cQP.rem;
	1346
	1347	const Int rightShift = (IQUANT_SHIFT - (iTransformShift + QP_per)) + (enableScalingLists ? LOG2_SCALING_LIST_NEUTRAL_VALUE : 0);
	1348
	1349	if(enableScalingLists)
[2]	1350	{
[1313]	1351	//from the dequantisation equation:
	1352	//iCoeffQ = ((Intermediate_Int(clipQCoef) * piDequantCoef[deQuantIdx]) + iAdd ) >> rightShift
	1353	//(sizeof(Intermediate_Int) * 8) = inputBitDepth + dequantCoefBits - rightShift
	1354	const UInt dequantCoefBits = 1 + IQUANT_SHIFT + SCALING_LIST_BITS;
	1355	const UInt targetInputBitDepth = std::min<UInt>((maxLog2TrDynamicRange + 1), (((sizeof(Intermediate_Int) * 8) + rightShift) - dequantCoefBits));
[608]	1356
[1313]	1357	const Intermediate_Int inputMinimum = -(1 << (targetInputBitDepth - 1));
	1358	const Intermediate_Int inputMaximum = (1 << (targetInputBitDepth - 1)) - 1;
	1359
	1360	Int *piDequantCoef = getDequantCoeff(scalingListType,QP_rem,uiLog2TrSize-2);
	1361
	1362	if(rightShift > 0)
[2]	1363	{
[1313]	1364	const Intermediate_Int iAdd = 1 << (rightShift - 1);
	1365
	1366	for( Int n = 0; n < numSamplesInBlock; n++ )
[56]	1367	{
[1313]	1368	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, piQCoef[n]));
	1369	const Intermediate_Int iCoeffQ = ((Intermediate_Int(clipQCoef) * piDequantCoef[n]) + iAdd ) >> rightShift;
	1370
	1371	piCoef[n] = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
[56]	1372	}
[2]	1373	}
	1374	else
	1375	{
[1313]	1376	const Int leftShift = -rightShift;
	1377
	1378	for( Int n = 0; n < numSamplesInBlock; n++ )
[56]	1379	{
[1313]	1380	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, piQCoef[n]));
	1381	const Intermediate_Int iCoeffQ = (Intermediate_Int(clipQCoef) * piDequantCoef[n]) << leftShift;
	1382
	1383	piCoef[n] = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
[56]	1384	}
	1385	}
[2]	1386	}
[56]	1387	else
[2]	1388	{
[1313]	1389	const Int scale = g_invQuantScales[QP_rem];
	1390	const Int scaleBits = (IQUANT_SHIFT + 1) ;
[2]	1391
[1313]	1392	//from the dequantisation equation:
	1393	//iCoeffQ = Intermediate_Int((Int64(clipQCoef) * scale + iAdd) >> rightShift);
	1394	//(sizeof(Intermediate_Int) * 8) = inputBitDepth + scaleBits - rightShift
	1395	const UInt targetInputBitDepth = std::min<UInt>((maxLog2TrDynamicRange + 1), (((sizeof(Intermediate_Int) * 8) + rightShift) - scaleBits));
	1396	const Intermediate_Int inputMinimum = -(1 << (targetInputBitDepth - 1));
	1397	const Intermediate_Int inputMaximum = (1 << (targetInputBitDepth - 1)) - 1;
	1398
	1399	if (rightShift > 0)
[56]	1400	{
[1313]	1401	const Intermediate_Int iAdd = 1 << (rightShift - 1);
	1402
	1403	for( Int n = 0; n < numSamplesInBlock; n++ )
	1404	{
	1405	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, piQCoef[n]));
	1406	const Intermediate_Int iCoeffQ = (Intermediate_Int(clipQCoef) * scale + iAdd) >> rightShift;
	1407
	1408	piCoef[n] = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
	1409	}
[56]	1410	}
[1313]	1411	else
	1412	{
	1413	const Int leftShift = -rightShift;
	1414
	1415	for( Int n = 0; n < numSamplesInBlock; n++ )
	1416	{
	1417	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, piQCoef[n]));
	1418	const Intermediate_Int iCoeffQ = (Intermediate_Int(clipQCoef) * scale) << leftShift;
	1419
	1420	piCoef[n] = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
	1421	}
	1422	}
[2]	1423	}
	1424	}
[56]	1425
[1313]	1426
	1427	Void TComTrQuant::init( UInt uiMaxTrSize,
	1428	Bool bUseRDOQ,
	1429	Bool bUseRDOQTS,
	1430	Bool useSelectiveRDOQ,
	1431	Bool bEnc,
	1432	Bool useTransformSkipFast
[56]	1433	#if ADAPTIVE_QP_SELECTION
[1313]	1434	, Bool bUseAdaptQpSelect
[2]	1435	#endif
[56]	1436	)
[2]	1437	{
	1438	m_uiMaxTrSize = uiMaxTrSize;
	1439	m_bEnc = bEnc;
[1313]	1440	m_useRDOQ = bUseRDOQ;
	1441	m_useRDOQTS = bUseRDOQTS;
	1442	m_useSelectiveRDOQ = useSelectiveRDOQ;
[56]	1443	#if ADAPTIVE_QP_SELECTION
	1444	m_bUseAdaptQpSelect = bUseAdaptQpSelect;
[2]	1445	#endif
[608]	1446	m_useTransformSkipFast = useTransformSkipFast;
[2]	1447	}
	1448
[1313]	1449
	1450	Void TComTrQuant::transformNxN( TComTU & rTu,
	1451	const ComponentID compID,
	1452	Pel * pcResidual,
	1453	const UInt uiStride,
	1454	TCoeff * rpcCoeff,
[56]	1455	#if ADAPTIVE_QP_SELECTION
[1313]	1456	TCoeff * pcArlCoeff,
[2]	1457	#endif
[1313]	1458	TCoeff & uiAbsSum,
	1459	const QpParam & cQP
	1460	)
[2]	1461	{
[1313]	1462	const TComRectangle &rect = rTu.getRect(compID);
	1463	const UInt uiWidth = rect.width;
	1464	const UInt uiHeight = rect.height;
	1465	TComDataCU* pcCU = rTu.getCU();
	1466	const UInt uiAbsPartIdx = rTu.GetAbsPartIdxTU();
	1467	const UInt uiOrgTrDepth = rTu.GetTransformDepthRel();
	1468
	1469	uiAbsSum=0;
	1470
	1471	RDPCMMode rdpcmMode = RDPCM_OFF;
	1472	rdpcmNxN( rTu, compID, pcResidual, uiStride, cQP, rpcCoeff, uiAbsSum, rdpcmMode );
	1473
	1474	if (rdpcmMode == RDPCM_OFF)
[2]	1475	{
[1313]	1476	uiAbsSum = 0;
	1477	//transform and quantise
	1478	if(pcCU->getCUTransquantBypass(uiAbsPartIdx))
[2]	1479	{
[1313]	1480	const Bool rotateResidual = rTu.isNonTransformedResidualRotated(compID);
	1481	const UInt uiSizeMinus1 = (uiWidth * uiHeight) - 1;
	1482
	1483	for (UInt y = 0, coefficientIndex = 0; y<uiHeight; y++)
[2]	1484	{
[1313]	1485	for (UInt x = 0; x<uiWidth; x++, coefficientIndex++)
	1486	{
	1487	const Pel currentSample = pcResidual[(y * uiStride) + x];
	1488
	1489	rpcCoeff[rotateResidual ? (uiSizeMinus1 - coefficientIndex) : coefficientIndex] = currentSample;
	1490	uiAbsSum += TCoeff(abs(currentSample));
	1491	}
[2]	1492	}
	1493	}
[1313]	1494	else
	1495	{
	1496	#if DEBUG_TRANSFORM_AND_QUANTISE
	1497	std::cout << g_debugCounter << ": " << uiWidth << "x" << uiHeight << " channel " << compID << " TU at input to transform\n";
	1498	printBlock(pcResidual, uiWidth, uiHeight, uiStride);
	1499	#endif
	1500
	1501	assert( (pcCU->getSlice()->getSPS()->getMaxTrSize() >= uiWidth) );
	1502
	1503	if(pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0)
	1504	{
	1505	xTransformSkip( pcResidual, uiStride, m_plTempCoeff, rTu, compID );
	1506	}
	1507	else
	1508	{
	1509	const Int channelBitDepth=pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
	1510	xT( channelBitDepth, rTu.useDST(compID), pcResidual, uiStride, m_plTempCoeff, uiWidth, uiHeight, pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID)) );
	1511	}
	1512
	1513	#if DEBUG_TRANSFORM_AND_QUANTISE
	1514	std::cout << g_debugCounter << ": " << uiWidth << "x" << uiHeight << " channel " << compID << " TU between transform and quantiser\n";
	1515	printBlock(m_plTempCoeff, uiWidth, uiHeight, uiWidth);
	1516	#endif
	1517
	1518	xQuant( rTu, m_plTempCoeff, rpcCoeff,
	1519
	1520	#if ADAPTIVE_QP_SELECTION
	1521	pcArlCoeff,
	1522	#endif
	1523	uiAbsSum, compID, cQP );
	1524
	1525	#if DEBUG_TRANSFORM_AND_QUANTISE
	1526	std::cout << g_debugCounter << ": " << uiWidth << "x" << uiHeight << " channel " << compID << " TU at output of quantiser\n";
	1527	printBlock(rpcCoeff, uiWidth, uiHeight, uiWidth);
	1528	#endif
	1529	}
[2]	1530	}
[1313]	1531
	1532	//set the CBF
	1533	pcCU->setCbfPartRange((((uiAbsSum > 0) ? 1 : 0) << uiOrgTrDepth), compID, uiAbsPartIdx, rTu.GetAbsPartIdxNumParts(compID));
	1534	}
	1535
	1536
	1537	Void TComTrQuant::invTransformNxN( TComTU &rTu,
	1538	const ComponentID compID,
	1539	Pel *pcResidual,
	1540	const UInt uiStride,
	1541	TCoeff * pcCoeff,
	1542	const QpParam &cQP
	1543	DEBUG_STRING_FN_DECLAREP(psDebug))
	1544	{
	1545	TComDataCU* pcCU=rTu.getCU();
	1546	const UInt uiAbsPartIdx = rTu.GetAbsPartIdxTU();
	1547	const TComRectangle &rect = rTu.getRect(compID);
	1548	const UInt uiWidth = rect.width;
	1549	const UInt uiHeight = rect.height;
	1550
	1551	if (uiWidth != uiHeight) //for intra, the TU will have been split above this level, so this condition won't be true, hence this only affects inter
[2]	1552	{
[1313]	1553	//------------------------------------------------
	1554
	1555	//recurse deeper
	1556
	1557	TComTURecurse subTURecurse(rTu, false, TComTU::VERTICAL_SPLIT, true, compID);
	1558
	1559	do
	1560	{
	1561	//------------------
	1562
	1563	const UInt lineOffset = subTURecurse.GetSectionNumber() * subTURecurse.getRect(compID).height;
	1564
	1565	Pel subTUResidual = pcResidual + (lineOffset uiStride);
	1566	TCoeff subTUCoefficients = pcCoeff + (lineOffset subTURecurse.getRect(compID).width);
	1567
	1568	invTransformNxN(subTURecurse, compID, subTUResidual, uiStride, subTUCoefficients, cQP DEBUG_STRING_PASS_INTO(psDebug));
	1569
	1570	//------------------
	1571
	1572	} while (subTURecurse.nextSection(rTu));
	1573
	1574	//------------------------------------------------
	1575
	1576	return;
[2]	1577	}
[1313]	1578
	1579	#if DEBUG_STRING
	1580	if (psDebug)
[2]	1581	{
[1313]	1582	std::stringstream ss(stringstream::out);
	1583	printBlockToStream(ss, (compID==0)?"###InvTran ip Ch0: " : ((compID==1)?"###InvTran ip Ch1: ":"###InvTran ip Ch2: "), pcCoeff, uiWidth, uiHeight, uiWidth);
	1584	DEBUG_STRING_APPEND((*psDebug), ss.str())
[2]	1585	}
[1313]	1586	#endif
	1587
	1588	if(pcCU->getCUTransquantBypass(uiAbsPartIdx))
[608]	1589	{
[1313]	1590	const Bool rotateResidual = rTu.isNonTransformedResidualRotated(compID);
	1591	const UInt uiSizeMinus1 = (uiWidth * uiHeight) - 1;
	1592
	1593	for (UInt y = 0, coefficientIndex = 0; y<uiHeight; y++)
	1594	{
	1595	for (UInt x = 0; x<uiWidth; x++, coefficientIndex++)
	1596	{
	1597	pcResidual[(y * uiStride) + x] = Pel(pcCoeff[rotateResidual ? (uiSizeMinus1 - coefficientIndex) : coefficientIndex]);
	1598	}
	1599	}
[608]	1600	}
	1601	else
	1602	{
[1313]	1603	#if DEBUG_TRANSFORM_AND_QUANTISE
	1604	std::cout << g_debugCounter << ": " << uiWidth << "x" << uiHeight << " channel " << compID << " TU at input to dequantiser\n";
	1605	printBlock(pcCoeff, uiWidth, uiHeight, uiWidth);
	1606	#endif
	1607
	1608	xDeQuant(rTu, pcCoeff, m_plTempCoeff, compID, cQP);
	1609
	1610	#if DEBUG_TRANSFORM_AND_QUANTISE
	1611	std::cout << g_debugCounter << ": " << uiWidth << "x" << uiHeight << " channel " << compID << " TU between dequantiser and inverse-transform\n";
	1612	printBlock(m_plTempCoeff, uiWidth, uiHeight, uiWidth);
	1613	#endif
	1614
	1615	#if DEBUG_STRING
	1616	if (psDebug)
	1617	{
	1618	std::stringstream ss(stringstream::out);
	1619	printBlockToStream(ss, "###InvTran deq: ", m_plTempCoeff, uiWidth, uiHeight, uiWidth);
	1620	(*psDebug)+=ss.str();
	1621	}
	1622	#endif
	1623
	1624	if(pcCU->getTransformSkip(uiAbsPartIdx, compID))
	1625	{
	1626	xITransformSkip( m_plTempCoeff, pcResidual, uiStride, rTu, compID );
	1627
	1628	#if DEBUG_STRING
	1629	if (psDebug)
	1630	{
	1631	std::stringstream ss(stringstream::out);
	1632	printBlockToStream(ss, "###InvTran resi: ", pcResidual, uiWidth, uiHeight, uiStride);
	1633	(*psDebug)+=ss.str();
	1634	(*psDebug)+="(<- was a Transform-skipped block)\n";
	1635	}
	1636	#endif
	1637	}
	1638	else
	1639	{
	1640	#if O0043_BEST_EFFORT_DECODING
	1641	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getStreamBitDepth(toChannelType(compID));
	1642	#else
	1643	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
	1644	#endif
	1645	xIT( channelBitDepth, rTu.useDST(compID), m_plTempCoeff, pcResidual, uiStride, uiWidth, uiHeight, pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID)) );
	1646
	1647	#if DEBUG_STRING
	1648	if (psDebug)
	1649	{
	1650	std::stringstream ss(stringstream::out);
	1651	printBlockToStream(ss, "###InvTran resi: ", pcResidual, uiWidth, uiHeight, uiStride);
	1652	(*psDebug)+=ss.str();
	1653	(*psDebug)+="(<- was a Transformed block)\n";
	1654	}
	1655	#endif
	1656	}
	1657
	1658	#if DEBUG_TRANSFORM_AND_QUANTISE
	1659	std::cout << g_debugCounter << ": " << uiWidth << "x" << uiHeight << " channel " << compID << " TU at output of inverse-transform\n";
	1660	printBlock(pcResidual, uiWidth, uiHeight, uiStride);
	1661	g_debugCounter++;
	1662	#endif
[608]	1663	}
[1313]	1664
	1665	invRdpcmNxN( rTu, compID, pcResidual, uiStride );
[2]	1666	}
	1667
[1313]	1668	Void TComTrQuant::invRecurTransformNxN( const ComponentID compID,
	1669	TComYuv *pResidual,
	1670	TComTU &rTu)
[2]	1671	{
[1313]	1672	if (!rTu.ProcessComponentSection(compID))
[2]	1673	{
[1313]	1674	return;
	1675	}
	1676
	1677	TComDataCU* pcCU = rTu.getCU();
	1678	UInt absPartIdxTU = rTu.GetAbsPartIdxTU();
	1679	UInt uiTrMode=rTu.GetTransformDepthRel();
	1680	if( (pcCU->getCbf(absPartIdxTU, compID, uiTrMode) == 0) && (isLuma(compID) \|\| !pcCU->getSlice()->getPPS()->getPpsRangeExtension().getCrossComponentPredictionEnabledFlag()) )
	1681	{
	1682	return;
	1683	}
	1684
	1685	if( uiTrMode == pcCU->getTransformIdx( absPartIdxTU ) )
	1686	{
	1687	const TComRectangle &tuRect = rTu.getRect(compID);
	1688	const Int uiStride = pResidual->getStride( compID );
	1689	Pel *rpcResidual = pResidual->getAddr( compID );
	1690	UInt uiAddr = (tuRect.x0 + uiStride*tuRect.y0);
	1691	Pel *pResi = rpcResidual + uiAddr;
	1692	TCoeff *pcCoeff = pcCU->getCoeff(compID) + rTu.getCoefficientOffset(compID);
	1693
	1694	const QpParam cQP(*pcCU, compID);
	1695
	1696	if(pcCU->getCbf(absPartIdxTU, compID, uiTrMode) != 0)
[2]	1697	{
[1313]	1698	DEBUG_STRING_NEW(sTemp)
	1699	#if DEBUG_STRING
	1700	std::string *psDebug=((DebugOptionList::DebugString_InvTran.getInt()&(pcCU->isIntra(absPartIdxTU)?1:(pcCU->isInter(absPartIdxTU)?2:4)))!=0) ? &sTemp : 0;
	1701	#endif
	1702
	1703	invTransformNxN( rTu, compID, pResi, uiStride, pcCoeff, cQP DEBUG_STRING_PASS_INTO(psDebug) );
	1704
	1705	#if DEBUG_STRING
	1706	if (psDebug != 0)
[56]	1707	{
[1313]	1708	std::cout << (*psDebug);
[56]	1709	}
[1313]	1710	#endif
	1711	}
	1712
	1713	if (isChroma(compID) && (pcCU->getCrossComponentPredictionAlpha(absPartIdxTU, compID) != 0))
	1714	{
	1715	const Pel *piResiLuma = pResidual->getAddr( COMPONENT_Y );
	1716	const Int strideLuma = pResidual->getStride( COMPONENT_Y );
	1717	const Int tuWidth = rTu.getRect( compID ).width;
	1718	const Int tuHeight = rTu.getRect( compID ).height;
	1719
	1720	if(pcCU->getCbf(absPartIdxTU, COMPONENT_Y, uiTrMode) != 0)
	1721	{
	1722	pResi = rpcResidual + uiAddr;
	1723	const Pel *pResiLuma = piResiLuma + uiAddr;
	1724
	1725	crossComponentPrediction( rTu, compID, pResiLuma, pResi, pResi, tuWidth, tuHeight, strideLuma, uiStride, uiStride, true );
	1726	}
	1727	}
[2]	1728	}
[1313]	1729	else
[608]	1730	{
[1313]	1731	TComTURecurse tuRecurseChild(rTu, false);
	1732	do
	1733	{
	1734	invRecurTransformNxN( compID, pResidual, tuRecurseChild );
	1735	} while (tuRecurseChild.nextSection(rTu));
[608]	1736	}
[1313]	1737	}
	1738
	1739	Void TComTrQuant::applyForwardRDPCM( TComTU& rTu, const ComponentID compID, Pel* pcResidual, const UInt uiStride, const QpParam& cQP, TCoeff* pcCoeff, TCoeff &uiAbsSum, const RDPCMMode mode )
	1740	{
	1741	TComDataCU *pcCU=rTu.getCU();
	1742	const UInt uiAbsPartIdx=rTu.GetAbsPartIdxTU();
	1743
	1744	const Bool bLossless = pcCU->getCUTransquantBypass( uiAbsPartIdx );
	1745	const UInt uiWidth = rTu.getRect(compID).width;
	1746	const UInt uiHeight = rTu.getRect(compID).height;
	1747	const Bool rotateResidual = rTu.isNonTransformedResidualRotated(compID);
	1748	const UInt uiSizeMinus1 = (uiWidth * uiHeight) - 1;
	1749
	1750	UInt uiX = 0;
	1751	UInt uiY = 0;
	1752
	1753	UInt &majorAxis = (mode == RDPCM_VER) ? uiX : uiY;
	1754	UInt &minorAxis = (mode == RDPCM_VER) ? uiY : uiX;
	1755	const UInt majorAxisLimit = (mode == RDPCM_VER) ? uiWidth : uiHeight;
	1756	const UInt minorAxisLimit = (mode == RDPCM_VER) ? uiHeight : uiWidth;
	1757
	1758	const Bool bUseHalfRoundingPoint = (mode != RDPCM_OFF);
	1759
	1760	uiAbsSum = 0;
	1761
	1762	for ( majorAxis = 0; majorAxis < majorAxisLimit; majorAxis++ )
[608]	1763	{
[1313]	1764	TCoeff accumulatorValue = 0; // 32-bit accumulator
	1765	for ( minorAxis = 0; minorAxis < minorAxisLimit; minorAxis++ )
	1766	{
	1767	const UInt sampleIndex = (uiY * uiWidth) + uiX;
	1768	const UInt coefficientIndex = (rotateResidual ? (uiSizeMinus1-sampleIndex) : sampleIndex);
	1769	const Pel currentSample = pcResidual[(uiY * uiStride) + uiX];
	1770	const TCoeff encoderSideDelta = TCoeff(currentSample) - accumulatorValue;
	1771
	1772	Pel reconstructedDelta;
	1773	if ( bLossless )
	1774	{
	1775	pcCoeff[coefficientIndex] = encoderSideDelta;
	1776	reconstructedDelta = (Pel) encoderSideDelta;
	1777	}
	1778	else
	1779	{
	1780	transformSkipQuantOneSample(rTu, compID, encoderSideDelta, pcCoeff, coefficientIndex, cQP, bUseHalfRoundingPoint);
	1781	invTrSkipDeQuantOneSample (rTu, compID, pcCoeff[coefficientIndex], reconstructedDelta, cQP, coefficientIndex);
	1782	}
	1783
	1784	uiAbsSum += abs(pcCoeff[coefficientIndex]);
	1785
	1786	if (mode != RDPCM_OFF)
	1787	{
	1788	accumulatorValue += reconstructedDelta;
	1789	}
	1790	}
[608]	1791	}
[2]	1792	}
	1793
[1313]	1794	Void TComTrQuant::rdpcmNxN ( TComTU& rTu, const ComponentID compID, Pel* pcResidual, const UInt uiStride, const QpParam& cQP, TCoeff* pcCoeff, TCoeff &uiAbsSum, RDPCMMode& rdpcmMode )
[2]	1795	{
[1313]	1796	TComDataCU *pcCU=rTu.getCU();
	1797	const UInt uiAbsPartIdx=rTu.GetAbsPartIdxTU();
	1798
	1799	if (!pcCU->isRDPCMEnabled(uiAbsPartIdx) \|\| ((pcCU->getTransformSkip(uiAbsPartIdx, compID) == 0) && !pcCU->getCUTransquantBypass(uiAbsPartIdx)))
[2]	1800	{
[1313]	1801	rdpcmMode = RDPCM_OFF;
	1802	}
	1803	else if ( pcCU->isIntra( uiAbsPartIdx ) )
[2]	1804	{
[1313]	1805	const ChromaFormat chFmt = pcCU->getPic()->getPicYuvOrg()->getChromaFormat();
	1806	const ChannelType chType = toChannelType(compID);
	1807	const UInt uiChPredMode = pcCU->getIntraDir( chType, uiAbsPartIdx );
	1808	const TComSPS *sps=pcCU->getSlice()->getSPS();
	1809	const UInt partsPerMinCU = 1<<(2*(sps->getMaxTotalCUDepth() - sps->getLog2DiffMaxMinCodingBlockSize()));
	1810	const UInt uiChCodedMode = (uiChPredMode==DM_CHROMA_IDX && isChroma(compID)) ? pcCU->getIntraDir(CHANNEL_TYPE_LUMA, getChromasCorrespondingPULumaIdx(uiAbsPartIdx, chFmt, partsPerMinCU)) : uiChPredMode;
	1811	const UInt uiChFinalMode = ((chFmt == CHROMA_422) && isChroma(compID)) ? g_chroma422IntraAngleMappingTable[uiChCodedMode] : uiChCodedMode;
	1812
	1813	if (uiChFinalMode == VER_IDX \|\| uiChFinalMode == HOR_IDX)
[2]	1814	{
[1313]	1815	rdpcmMode = (uiChFinalMode == VER_IDX) ? RDPCM_VER : RDPCM_HOR;
	1816	applyForwardRDPCM( rTu, compID, pcResidual, uiStride, cQP, pcCoeff, uiAbsSum, rdpcmMode );
	1817	}
	1818	else
	1819	{
	1820	rdpcmMode = RDPCM_OFF;
	1821	}
	1822	}
	1823	else // not intra, need to select the best mode
	1824	{
	1825	const UInt uiWidth = rTu.getRect(compID).width;
	1826	const UInt uiHeight = rTu.getRect(compID).height;
	1827
	1828	RDPCMMode bestMode = NUMBER_OF_RDPCM_MODES;
	1829	TCoeff bestAbsSum = std::numeric_limits<TCoeff>::max();
	1830	TCoeff bestCoefficients[MAX_TU_SIZE * MAX_TU_SIZE];
	1831
	1832	for (UInt modeIndex = 0; modeIndex < NUMBER_OF_RDPCM_MODES; modeIndex++)
	1833	{
	1834	const RDPCMMode mode = RDPCMMode(modeIndex);
	1835
	1836	TCoeff currAbsSum = 0;
	1837
	1838	applyForwardRDPCM( rTu, compID, pcResidual, uiStride, cQP, pcCoeff, currAbsSum, mode );
	1839
	1840	if (currAbsSum < bestAbsSum)
[2]	1841	{
[1313]	1842	bestMode = mode;
	1843	bestAbsSum = currAbsSum;
	1844	if (mode != RDPCM_OFF)
	1845	{
	1846	memcpy(bestCoefficients, pcCoeff, (uiWidth * uiHeight * sizeof(TCoeff)));
	1847	}
[2]	1848	}
	1849	}
[1313]	1850
	1851	rdpcmMode = bestMode;
	1852	uiAbsSum = bestAbsSum;
	1853
	1854	if (rdpcmMode != RDPCM_OFF) //the TU is re-transformed and quantised if DPCM_OFF is returned, so there is no need to preserve it here
	1855	{
	1856	memcpy(pcCoeff, bestCoefficients, (uiWidth * uiHeight * sizeof(TCoeff)));
	1857	}
[2]	1858	}
[1313]	1859
	1860	pcCU->setExplicitRdpcmModePartRange(rdpcmMode, compID, uiAbsPartIdx, rTu.GetAbsPartIdxNumParts(compID));
	1861	}
	1862
	1863	Void TComTrQuant::invRdpcmNxN( TComTU& rTu, const ComponentID compID, Pel* pcResidual, const UInt uiStride )
	1864	{
	1865	TComDataCU *pcCU=rTu.getCU();
	1866	const UInt uiAbsPartIdx=rTu.GetAbsPartIdxTU();
	1867
	1868	if (pcCU->isRDPCMEnabled( uiAbsPartIdx ) && ((pcCU->getTransformSkip(uiAbsPartIdx, compID ) != 0) \|\| pcCU->getCUTransquantBypass(uiAbsPartIdx)))
[2]	1869	{
[1313]	1870	const UInt uiWidth = rTu.getRect(compID).width;
	1871	const UInt uiHeight = rTu.getRect(compID).height;
	1872
	1873	RDPCMMode rdpcmMode = RDPCM_OFF;
	1874
	1875	if ( pcCU->isIntra( uiAbsPartIdx ) )
[56]	1876	{
[1313]	1877	const ChromaFormat chFmt = pcCU->getPic()->getPicYuvRec()->getChromaFormat();
	1878	const ChannelType chType = toChannelType(compID);
	1879	const UInt uiChPredMode = pcCU->getIntraDir( chType, uiAbsPartIdx );
	1880	const TComSPS *sps=pcCU->getSlice()->getSPS();
	1881	const UInt partsPerMinCU = 1<<(2*(sps->getMaxTotalCUDepth() - sps->getLog2DiffMaxMinCodingBlockSize()));
	1882	const UInt uiChCodedMode = (uiChPredMode==DM_CHROMA_IDX && isChroma(compID)) ? pcCU->getIntraDir(CHANNEL_TYPE_LUMA, getChromasCorrespondingPULumaIdx(uiAbsPartIdx, chFmt, partsPerMinCU)) : uiChPredMode;
	1883	const UInt uiChFinalMode = ((chFmt == CHROMA_422) && isChroma(compID)) ? g_chroma422IntraAngleMappingTable[uiChCodedMode] : uiChCodedMode;
	1884
	1885	if (uiChFinalMode == VER_IDX \|\| uiChFinalMode == HOR_IDX)
	1886	{
	1887	rdpcmMode = (uiChFinalMode == VER_IDX) ? RDPCM_VER : RDPCM_HOR;
	1888	}
[56]	1889	}
[1313]	1890	else // not intra case
	1891	{
	1892	rdpcmMode = RDPCMMode(pcCU->getExplicitRdpcmMode( compID, uiAbsPartIdx ));
	1893	}
	1894
	1895	const TCoeff pelMin=(TCoeff) std::numeric_limits<Pel>::min();
	1896	const TCoeff pelMax=(TCoeff) std::numeric_limits<Pel>::max();
	1897	if (rdpcmMode == RDPCM_VER)
	1898	{
	1899	for( UInt uiX = 0; uiX < uiWidth; uiX++ )
	1900	{
	1901	Pel *pcCurResidual = pcResidual+uiX;
	1902	TCoeff accumulator = *pcCurResidual; // 32-bit accumulator
	1903	pcCurResidual+=uiStride;
	1904	for( UInt uiY = 1; uiY < uiHeight; uiY++, pcCurResidual+=uiStride )
	1905	{
	1906	accumulator += *(pcCurResidual);
	1907	*pcCurResidual = (Pel)Clip3<TCoeff>(pelMin, pelMax, accumulator);
	1908	}
	1909	}
	1910	}
	1911	else if (rdpcmMode == RDPCM_HOR)
	1912	{
	1913	for( UInt uiY = 0; uiY < uiHeight; uiY++ )
	1914	{
	1915	Pel pcCurResidual = pcResidual+uiYuiStride;
	1916	TCoeff accumulator = *pcCurResidual;
	1917	pcCurResidual++;
	1918	for( UInt uiX = 1; uiX < uiWidth; uiX++, pcCurResidual++ )
	1919	{
	1920	accumulator += *(pcCurResidual);
	1921	*pcCurResidual = (Pel)Clip3<TCoeff>(pelMin, pelMax, accumulator);
	1922	}
	1923	}
	1924	}
[2]	1925	}
	1926	}
	1927
	1928	// ------------------------------------------------------------------------------------------------
	1929	// Logical transform
	1930	// ------------------------------------------------------------------------------------------------
	1931
[1313]	1932	/** Wrapper function between HM interface and core NxN forward transform (2D)
	1933	* \param channelBitDepth bit depth of channel
	1934	* \param useDST
[2]	1935	* \param piBlkResi input data (residual)
[1313]	1936	* \param uiStride stride of input residual data
[2]	1937	* \param psCoeff output data (transform coefficients)
[1313]	1938	* \param iWidth transform width
	1939	* \param iHeight transform height
	1940	* \param maxLog2TrDynamicRange
[2]	1941	*/
[1313]	1942	Void TComTrQuant::xT( const Int channelBitDepth, Bool useDST, Pel* piBlkResi, UInt uiStride, TCoeff* psCoeff, Int iWidth, Int iHeight, const Int maxLog2TrDynamicRange )
[2]	1943	{
[1313]	1944	#if MATRIX_MULT
	1945	if( iWidth == iHeight)
	1946	{
	1947	xTr(channelBitDepth, piBlkResi, psCoeff, uiStride, (UInt)iWidth, useDST, maxLog2TrDynamicRange);
	1948	return;
	1949	}
	1950	#endif
	1951
	1952	TCoeff block[ MAX_TU_SIZE * MAX_TU_SIZE ];
	1953	TCoeff coeff[ MAX_TU_SIZE * MAX_TU_SIZE ];
	1954
	1955	for (Int y = 0; y < iHeight; y++)
	1956	{
	1957	for (Int x = 0; x < iWidth; x++)
	1958	{
	1959	block[(y * iWidth) + x] = piBlkResi[(y * uiStride) + x];
[56]	1960	}
[1313]	1961	}
	1962
	1963	xTrMxN( channelBitDepth, block, coeff, iWidth, iHeight, useDST, maxLog2TrDynamicRange );
	1964
	1965	memcpy(psCoeff, coeff, (iWidth * iHeight * sizeof(TCoeff)));
[2]	1966	}
	1967
[1313]	1968	/** Wrapper function between HM interface and core NxN inverse transform (2D)
	1969	* \param channelBitDepth bit depth of channel
	1970	* \param useDST
[2]	1971	* \param plCoef input data (transform coefficients)
	1972	* \param pResidual output data (residual)
	1973	* \param uiStride stride of input residual data
[1313]	1974	* \param iWidth transform width
	1975	* \param iHeight transform height
	1976	* \param maxLog2TrDynamicRange
[2]	1977	*/
[1313]	1978	Void TComTrQuant::xIT( const Int channelBitDepth, Bool useDST, TCoeff* plCoef, Pel* pResidual, UInt uiStride, Int iWidth, Int iHeight, const Int maxLog2TrDynamicRange )
[2]	1979	{
[1313]	1980	#if MATRIX_MULT
	1981	if( iWidth == iHeight )
[56]	1982	{
[1313]	1983	xITr(channelBitDepth, plCoef, pResidual, uiStride, (UInt)iWidth, useDST, maxLog2TrDynamicRange);
	1984	return;
	1985	}
	1986	#endif
	1987
	1988	TCoeff block[ MAX_TU_SIZE * MAX_TU_SIZE ];
	1989	TCoeff coeff[ MAX_TU_SIZE * MAX_TU_SIZE ];
	1990
	1991	memcpy(coeff, plCoef, (iWidth * iHeight * sizeof(TCoeff)));
	1992
	1993	xITrMxN( channelBitDepth, coeff, block, iWidth, iHeight, useDST, maxLog2TrDynamicRange );
	1994
	1995	for (Int y = 0; y < iHeight; y++)
	1996	{
	1997	for (Int x = 0; x < iWidth; x++)
[56]	1998	{
[1313]	1999	pResidual[(y * uiStride) + x] = Pel(block[(y * iWidth) + x]);
[56]	2000	}
	2001	}
[2]	2002	}
[1313]	2003
[608]	2004	/** Wrapper function between HM interface and core 4x4 transform skipping
	2005	* \param piBlkResi input data (residual)
[1313]	2006	* \param uiStride stride of input residual data
[608]	2007	* \param psCoeff output data (transform coefficients)
[1313]	2008	* \param rTu reference to transform data
	2009	* \param component colour component
[608]	2010	*/
[1313]	2011	Void TComTrQuant::xTransformSkip( Pel* piBlkResi, UInt uiStride, TCoeff* psCoeff, TComTU &rTu, const ComponentID component )
[608]	2012	{
[1313]	2013	const TComRectangle &rect = rTu.getRect(component);
	2014	const Int width = rect.width;
	2015	const Int height = rect.height;
	2016	const Int maxLog2TrDynamicRange = rTu.getCU()->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(component));
	2017	const Int channelBitDepth = rTu.getCU()->getSlice()->getSPS()->getBitDepth(toChannelType(component));
	2018
	2019	Int iTransformShift = getTransformShift(channelBitDepth, rTu.GetEquivalentLog2TrSize(component), maxLog2TrDynamicRange);
	2020	if (rTu.getCU()->getSlice()->getSPS()->getSpsRangeExtension().getExtendedPrecisionProcessingFlag())
[608]	2021	{
[1313]	2022	iTransformShift = std::max<Int>(0, iTransformShift);
	2023	}
	2024
	2025	const Bool rotateResidual = rTu.isNonTransformedResidualRotated(component);
	2026	const UInt uiSizeMinus1 = (width * height) - 1;
	2027
	2028	if (iTransformShift >= 0)
	2029	{
	2030	for (UInt y = 0, coefficientIndex = 0; y < height; y++)
	2031	{
	2032	for (UInt x = 0; x < width; x++, coefficientIndex++)
[608]	2033	{
[1313]	2034	psCoeff[rotateResidual ? (uiSizeMinus1 - coefficientIndex) : coefficientIndex] = TCoeff(piBlkResi[(y * uiStride) + x]) << iTransformShift;
[608]	2035	}
	2036	}
	2037	}
[1313]	2038	else //for very high bit depths
[608]	2039	{
[1313]	2040	iTransformShift = -iTransformShift;
	2041	const TCoeff offset = 1 << (iTransformShift - 1);
	2042
	2043	for (UInt y = 0, coefficientIndex = 0; y < height; y++)
	2044	{
	2045	for (UInt x = 0; x < width; x++, coefficientIndex++)
[608]	2046	{
[1313]	2047	psCoeff[rotateResidual ? (uiSizeMinus1 - coefficientIndex) : coefficientIndex] = (TCoeff(piBlkResi[(y * uiStride) + x]) + offset) >> iTransformShift;
[608]	2048	}
	2049	}
	2050	}
	2051	}
	2052
[1313]	2053	/** Wrapper function between HM interface and core NxN transform skipping
[608]	2054	* \param plCoef input data (coefficients)
	2055	* \param pResidual output data (residual)
	2056	* \param uiStride stride of input residual data
[1313]	2057	* \param rTu reference to transform data
	2058	* \param component colour component ID
[608]	2059	*/
[1313]	2060	Void TComTrQuant::xITransformSkip( TCoeff* plCoef, Pel* pResidual, UInt uiStride, TComTU &rTu, const ComponentID component )
[608]	2061	{
[1313]	2062	const TComRectangle &rect = rTu.getRect(component);
	2063	const Int width = rect.width;
	2064	const Int height = rect.height;
	2065	const Int maxLog2TrDynamicRange = rTu.getCU()->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(component));
	2066	#if O0043_BEST_EFFORT_DECODING
	2067	const Int channelBitDepth = rTu.getCU()->getSlice()->getSPS()->getStreamBitDepth(toChannelType(component));
	2068	#else
	2069	const Int channelBitDepth = rTu.getCU()->getSlice()->getSPS()->getBitDepth(toChannelType(component));
	2070	#endif
	2071
	2072	Int iTransformShift = getTransformShift(channelBitDepth, rTu.GetEquivalentLog2TrSize(component), maxLog2TrDynamicRange);
	2073	if (rTu.getCU()->getSlice()->getSPS()->getSpsRangeExtension().getExtendedPrecisionProcessingFlag())
[608]	2074	{
[1313]	2075	iTransformShift = std::max<Int>(0, iTransformShift);
	2076	}
	2077
	2078	const Bool rotateResidual = rTu.isNonTransformedResidualRotated(component);
	2079	const UInt uiSizeMinus1 = (width * height) - 1;
	2080
	2081	if (iTransformShift >= 0)
	2082	{
	2083	const TCoeff offset = iTransformShift==0 ? 0 : (1 << (iTransformShift - 1));
	2084
	2085	for (UInt y = 0, coefficientIndex = 0; y < height; y++)
	2086	{
	2087	for (UInt x = 0; x < width; x++, coefficientIndex++)
[608]	2088	{
[1313]	2089	pResidual[(y * uiStride) + x] = Pel((plCoef[rotateResidual ? (uiSizeMinus1 - coefficientIndex) : coefficientIndex] + offset) >> iTransformShift);
	2090	}
[608]	2091	}
	2092	}
[1313]	2093	else //for very high bit depths
[608]	2094	{
[1313]	2095	iTransformShift = -iTransformShift;
	2096
	2097	for (UInt y = 0, coefficientIndex = 0; y < height; y++)
	2098	{
	2099	for (UInt x = 0; x < width; x++, coefficientIndex++)
[608]	2100	{
[1313]	2101	pResidual[(y * uiStride) + x] = Pel(plCoef[rotateResidual ? (uiSizeMinus1 - coefficientIndex) : coefficientIndex] << iTransformShift);
[608]	2102	}
	2103	}
	2104	}
	2105	}
	2106
[2]	2107	/** RDOQ with CABAC
[1313]	2108	* \param rTu reference to transform data
[2]	2109	* \param plSrcCoeff pointer to input buffer
	2110	* \param piDstCoeff reference to pointer to output buffer
[1313]	2111	* \param piArlDstCoeff
[2]	2112	* \param uiAbsSum reference to absolute sum of quantized transform coefficient
[1313]	2113	* \param compID colour component ID
	2114	* \param cQP reference to quantization parameters
	2115
[2]	2116	* Rate distortion optimized quantization for entropy
	2117	* coding engines using probability models like CABAC
	2118	*/
[1313]	2119	Void TComTrQuant::xRateDistOptQuant ( TComTU &rTu,
	2120	TCoeff * plSrcCoeff,
	2121	TCoeff * piDstCoeff,
[56]	2122	#if ADAPTIVE_QP_SELECTION
[1313]	2123	TCoeff * piArlDstCoeff,
[56]	2124	#endif
[1313]	2125	TCoeff &uiAbsSum,
	2126	const ComponentID compID,
	2127	const QpParam &cQP )
[2]	2128	{
[1313]	2129	const TComRectangle & rect = rTu.getRect(compID);
	2130	const UInt uiWidth = rect.width;
	2131	const UInt uiHeight = rect.height;
	2132	TComDataCU * pcCU = rTu.getCU();
	2133	const UInt uiAbsPartIdx = rTu.GetAbsPartIdxTU();
	2134	const ChannelType channelType = toChannelType(compID);
	2135	const UInt uiLog2TrSize = rTu.GetEquivalentLog2TrSize(compID);
	2136
	2137	const Bool extendedPrecision = pcCU->getSlice()->getSPS()->getSpsRangeExtension().getExtendedPrecisionProcessingFlag();
	2138	const Int maxLog2TrDynamicRange = pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID));
	2139	const Int channelBitDepth = rTu.getCU()->getSlice()->getSPS()->getBitDepth(channelType);
	2140
	2141	/* for 422 chroma blocks, the effective scaling applied during transformation is not a power of 2, hence it cannot be
	2142	* implemented as a bit-shift (the quantised result will be sqrt(2) * larger than required). Alternatively, adjust the
	2143	* uiLog2TrSize applied in iTransformShift, such that the result is 1/sqrt(2) the required result (i.e. smaller)
	2144	* Then a QP+3 (sqrt(2)) or QP-3 (1/sqrt(2)) method could be used to get the required result
	2145	*/
	2146
	2147	// Represents scaling through forward transform
	2148	Int iTransformShift = getTransformShift(channelBitDepth, uiLog2TrSize, maxLog2TrDynamicRange);
	2149	if ((pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0) && extendedPrecision)
	2150	{
	2151	iTransformShift = std::max<Int>(0, iTransformShift);
	2152	}
	2153
	2154	const Bool bUseGolombRiceParameterAdaptation = pcCU->getSlice()->getSPS()->getSpsRangeExtension().getPersistentRiceAdaptationEnabledFlag();
	2155	const UInt initialGolombRiceParameter = m_pcEstBitsSbac->golombRiceAdaptationStatistics[rTu.getGolombRiceStatisticsIndex(compID)] / RExt__GOLOMB_RICE_INCREMENT_DIVISOR;
	2156	UInt uiGoRiceParam = initialGolombRiceParameter;
	2157	Double d64BlockUncodedCost = 0;
	2158	const UInt uiLog2BlockWidth = g_aucConvertToBit[ uiWidth ] + 2;
	2159	const UInt uiLog2BlockHeight = g_aucConvertToBit[ uiHeight ] + 2;
	2160	const UInt uiMaxNumCoeff = uiWidth * uiHeight;
	2161	assert(compID<MAX_NUM_COMPONENT);
	2162
	2163	Int scalingListType = getScalingListType(pcCU->getPredictionMode(uiAbsPartIdx), compID);
[872]	2164	assert(scalingListType < SCALING_LIST_NUM);
[1313]	2165
[56]	2166	#if ADAPTIVE_QP_SELECTION
[1313]	2167	memset(piArlDstCoeff, 0, sizeof(TCoeff) * uiMaxNumCoeff);
	2168	#endif
	2169
	2170	Double pdCostCoeff [ MAX_TU_SIZE * MAX_TU_SIZE ];
	2171	Double pdCostSig [ MAX_TU_SIZE * MAX_TU_SIZE ];
	2172	Double pdCostCoeff0[ MAX_TU_SIZE * MAX_TU_SIZE ];
	2173	memset( pdCostCoeff, 0, sizeof(Double) * uiMaxNumCoeff );
	2174	memset( pdCostSig, 0, sizeof(Double) * uiMaxNumCoeff );
	2175	Int rateIncUp [ MAX_TU_SIZE * MAX_TU_SIZE ];
	2176	Int rateIncDown [ MAX_TU_SIZE * MAX_TU_SIZE ];
	2177	Int sigRateDelta[ MAX_TU_SIZE * MAX_TU_SIZE ];
	2178	TCoeff deltaU [ MAX_TU_SIZE * MAX_TU_SIZE ];
	2179	memset( rateIncUp, 0, sizeof(Int ) * uiMaxNumCoeff );
	2180	memset( rateIncDown, 0, sizeof(Int ) * uiMaxNumCoeff );
	2181	memset( sigRateDelta, 0, sizeof(Int ) * uiMaxNumCoeff );
	2182	memset( deltaU, 0, sizeof(TCoeff) * uiMaxNumCoeff );
	2183
	2184	const Int iQBits = QUANT_SHIFT + cQP.per + iTransformShift; // Right shift of non-RDOQ quantizer; level = (coeff*uiQ + offset)>>q_bits
	2185	const Double *const pdErrScale = getErrScaleCoeff(scalingListType, (uiLog2TrSize-2), cQP.rem);
	2186	const Int *const piQCoef = getQuantCoeff(scalingListType, cQP.rem, (uiLog2TrSize-2));
	2187
	2188	const Bool enableScalingLists = getUseScalingList(uiWidth, uiHeight, (pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0));
	2189	const Int defaultQuantisationCoefficient = g_quantScales[cQP.rem];
	2190	const Double defaultErrorScale = getErrScaleCoeffNoScalingList(scalingListType, (uiLog2TrSize-2), cQP.rem);
	2191
	2192	const TCoeff entropyCodingMinimum = -(1 << maxLog2TrDynamicRange);
	2193	const TCoeff entropyCodingMaximum = (1 << maxLog2TrDynamicRange) - 1;
	2194
	2195	#if ADAPTIVE_QP_SELECTION
[56]	2196	Int iQBitsC = iQBits - ARL_C_PRECISION;
	2197	Int iAddC = 1 << (iQBitsC-1);
	2198	#endif
[1313]	2199
	2200	TUEntropyCodingParameters codingParameters;
	2201	getTUEntropyCodingParameters(codingParameters, rTu, compID);
	2202	const UInt uiCGSize = (1 << MLS_CG_SIZE);
	2203
[56]	2204	Double pdCostCoeffGroupSig[ MLS_GRP_NUM ];
	2205	UInt uiSigCoeffGroupFlag[ MLS_GRP_NUM ];
	2206	Int iCGLastScanPos = -1;
[1313]	2207
[56]	2208	UInt uiCtxSet = 0;
	2209	Int c1 = 1;
	2210	Int c2 = 0;
	2211	Double d64BaseCost = 0;
	2212	Int iLastScanPos = -1;
[1313]	2213
[56]	2214	UInt c1Idx = 0;
	2215	UInt c2Idx = 0;
	2216	Int baseLevel;
[1313]	2217
	2218	memset( pdCostCoeffGroupSig, 0, sizeof(Double) * MLS_GRP_NUM );
	2219	memset( uiSigCoeffGroupFlag, 0, sizeof(UInt) * MLS_GRP_NUM );
	2220
[608]	2221	UInt uiCGNum = uiWidth * uiHeight >> MLS_CG_SIZE;
	2222	Int iScanPos;
[1313]	2223	coeffGroupRDStats rdStats;
	2224
	2225	const UInt significanceMapContextOffset = getSignificanceMapContextOffset(compID);
	2226
[608]	2227	for (Int iCGScanPos = uiCGNum-1; iCGScanPos >= 0; iCGScanPos--)
[56]	2228	{
[1313]	2229	UInt uiCGBlkPos = codingParameters.scanCG[ iCGScanPos ];
	2230	UInt uiCGPosY = uiCGBlkPos / codingParameters.widthInGroups;
	2231	UInt uiCGPosX = uiCGBlkPos - (uiCGPosY * codingParameters.widthInGroups);
	2232
	2233	memset( &rdStats, 0, sizeof (coeffGroupRDStats));
	2234
	2235	const Int patternSigCtx = TComTrQuant::calcPatternSigCtx(uiSigCoeffGroupFlag, uiCGPosX, uiCGPosY, codingParameters.widthInGroups, codingParameters.heightInGroups);
	2236
[608]	2237	for (Int iScanPosinCG = uiCGSize-1; iScanPosinCG >= 0; iScanPosinCG--)
[56]	2238	{
[608]	2239	iScanPos = iCGScanPos*uiCGSize + iScanPosinCG;
	2240	//===== quantization =====
[1313]	2241	UInt uiBlkPos = codingParameters.scan[iScanPos];
[608]	2242	// set coeff
[1313]	2243
	2244	const Int quantisationCoefficient = (enableScalingLists) ? piQCoef [uiBlkPos] : defaultQuantisationCoefficient;
	2245	const Double errorScale = (enableScalingLists) ? pdErrScale[uiBlkPos] : defaultErrorScale;
	2246
	2247	const Int64 tmpLevel = Int64(abs(plSrcCoeff[ uiBlkPos ])) * quantisationCoefficient;
	2248
	2249	const Intermediate_Int lLevelDouble = (Intermediate_Int)min<Int64>(tmpLevel, std::numeric_limits<Intermediate_Int>::max() - (Intermediate_Int(1) << (iQBits - 1)));
	2250
[608]	2251	#if ADAPTIVE_QP_SELECTION
	2252	if( m_bUseAdaptQpSelect )
[2]	2253	{
[1313]	2254	piArlDstCoeff[uiBlkPos] = (TCoeff)(( lLevelDouble + iAddC) >> iQBitsC );
[56]	2255	}
[608]	2256	#endif
[1313]	2257	const UInt uiMaxAbsLevel = std::min<UInt>(UInt(entropyCodingMaximum), UInt((lLevelDouble + (Intermediate_Int(1) << (iQBits - 1))) >> iQBits));
	2258
	2259	const Double dErr = Double( lLevelDouble );
	2260	pdCostCoeff0[ iScanPos ] = dErr * dErr * errorScale;
[608]	2261	d64BlockUncodedCost += pdCostCoeff0[ iScanPos ];
	2262	piDstCoeff[ uiBlkPos ] = uiMaxAbsLevel;
[1313]	2263
[608]	2264	if ( uiMaxAbsLevel > 0 && iLastScanPos < 0 )
	2265	{
	2266	iLastScanPos = iScanPos;
[1313]	2267	uiCtxSet = getContextSetIndex(compID, (iScanPos >> MLS_CG_SIZE), 0);
[608]	2268	iCGLastScanPos = iCGScanPos;
	2269	}
[1313]	2270
[608]	2271	if ( iLastScanPos >= 0 )
	2272	{
	2273	//===== coefficient level estimation =====
	2274	UInt uiLevel;
[1313]	2275	UInt uiOneCtx = (NUM_ONE_FLAG_CTX_PER_SET * uiCtxSet) + c1;
	2276	UInt uiAbsCtx = (NUM_ABS_FLAG_CTX_PER_SET * uiCtxSet) + c2;
	2277
[608]	2278	if( iScanPos == iLastScanPos )
[2]	2279	{
[1313]	2280	uiLevel = xGetCodedLevel( pdCostCoeff[ iScanPos ], pdCostCoeff0[ iScanPos ], pdCostSig[ iScanPos ],
	2281	lLevelDouble, uiMaxAbsLevel, significanceMapContextOffset, uiOneCtx, uiAbsCtx, uiGoRiceParam,
	2282	c1Idx, c2Idx, iQBits, errorScale, 1, extendedPrecision, maxLog2TrDynamicRange
	2283	);
[2]	2284	}
[608]	2285	else
[2]	2286	{
[1313]	2287	UShort uiCtxSig = significanceMapContextOffset + getSigCtxInc( patternSigCtx, codingParameters, iScanPos, uiLog2BlockWidth, uiLog2BlockHeight, channelType );
	2288
[608]	2289	uiLevel = xGetCodedLevel( pdCostCoeff[ iScanPos ], pdCostCoeff0[ iScanPos ], pdCostSig[ iScanPos ],
[1313]	2290	lLevelDouble, uiMaxAbsLevel, uiCtxSig, uiOneCtx, uiAbsCtx, uiGoRiceParam,
	2291	c1Idx, c2Idx, iQBits, errorScale, 0, extendedPrecision, maxLog2TrDynamicRange
	2292	);
	2293
[608]	2294	sigRateDelta[ uiBlkPos ] = m_pcEstBitsSbac->significantBits[ uiCtxSig ][ 1 ] - m_pcEstBitsSbac->significantBits[ uiCtxSig ][ 0 ];
[2]	2295	}
[1313]	2296
	2297	deltaU[ uiBlkPos ] = TCoeff((lLevelDouble - (Intermediate_Int(uiLevel) << iQBits)) >> (iQBits-8));
	2298
[608]	2299	if( uiLevel > 0 )
[2]	2300	{
[1313]	2301	Int rateNow = xGetICRate( uiLevel, uiOneCtx, uiAbsCtx, uiGoRiceParam, c1Idx, c2Idx, extendedPrecision, maxLog2TrDynamicRange );
	2302	rateIncUp [ uiBlkPos ] = xGetICRate( uiLevel+1, uiOneCtx, uiAbsCtx, uiGoRiceParam, c1Idx, c2Idx, extendedPrecision, maxLog2TrDynamicRange ) - rateNow;
	2303	rateIncDown [ uiBlkPos ] = xGetICRate( uiLevel-1, uiOneCtx, uiAbsCtx, uiGoRiceParam, c1Idx, c2Idx, extendedPrecision, maxLog2TrDynamicRange ) - rateNow;
[608]	2304	}
	2305	else // uiLevel == 0
	2306	{
	2307	rateIncUp [ uiBlkPos ] = m_pcEstBitsSbac->m_greaterOneBits[ uiOneCtx ][ 0 ];
	2308	}
	2309	piDstCoeff[ uiBlkPos ] = uiLevel;
	2310	d64BaseCost += pdCostCoeff [ iScanPos ];
[1313]	2311
[608]	2312	baseLevel = (c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx < C2FLAG_NUMBER)) : 1;
	2313	if( uiLevel >= baseLevel )
	2314	{
[1313]	2315	if (uiLevel > 3*(1<<uiGoRiceParam))
[2]	2316	{
[1313]	2317	uiGoRiceParam = bUseGolombRiceParameterAdaptation ? (uiGoRiceParam + 1) : (std::min<UInt>((uiGoRiceParam + 1), 4));
[56]	2318	}
[2]	2319	}
[608]	2320	if ( uiLevel >= 1)
[56]	2321	{
[608]	2322	c1Idx ++;
[56]	2323	}
[1313]	2324
[608]	2325	//===== update bin model =====
	2326	if( uiLevel > 1 )
[56]	2327	{
[1313]	2328	c1 = 0;
[608]	2329	c2 += (c2 < 2);
	2330	c2Idx ++;
[56]	2331	}
[608]	2332	else if( (c1 < 3) && (c1 > 0) && uiLevel)
[56]	2333	{
[608]	2334	c1++;
	2335	}
[1313]	2336
[608]	2337	//===== context set update =====
[1313]	2338	if( ( iScanPos % uiCGSize == 0 ) && ( iScanPos > 0 ) )
[608]	2339	{
[1313]	2340	uiCtxSet = getContextSetIndex(compID, ((iScanPos - 1) >> MLS_CG_SIZE), (c1 == 0)); //(iScanPos - 1) because we do this before entering the final group
	2341	c1 = 1;
[608]	2342	c2 = 0;
[1313]	2343	c1Idx = 0;
	2344	c2Idx = 0;
	2345	uiGoRiceParam = initialGolombRiceParameter;
[56]	2346	}
[608]	2347	}
	2348	else
[2]	2349	{
[608]	2350	d64BaseCost += pdCostCoeff0[ iScanPos ];
	2351	}
	2352	rdStats.d64SigCost += pdCostSig[ iScanPos ];
	2353	if (iScanPosinCG == 0 )
	2354	{
	2355	rdStats.d64SigCost_0 = pdCostSig[ iScanPos ];
	2356	}
	2357	if (piDstCoeff[ uiBlkPos ] )
	2358	{
	2359	uiSigCoeffGroupFlag[ uiCGBlkPos ] = 1;
	2360	rdStats.d64CodedLevelandDist += pdCostCoeff[ iScanPos ] - pdCostSig[ iScanPos ];
	2361	rdStats.d64UncodedDist += pdCostCoeff0[ iScanPos ];
	2362	if ( iScanPosinCG != 0 )
[2]	2363	{
[608]	2364	rdStats.iNNZbeforePos0++;
	2365	}
	2366	}
	2367	} //end for (iScanPosinCG)
[1313]	2368
	2369	if (iCGLastScanPos >= 0)
[608]	2370	{
	2371	if( iCGScanPos )
	2372	{
	2373	if (uiSigCoeffGroupFlag[ uiCGBlkPos ] == 0)
	2374	{
[1313]	2375	UInt uiCtxSig = getSigCoeffGroupCtxInc( uiSigCoeffGroupFlag, uiCGPosX, uiCGPosY, codingParameters.widthInGroups, codingParameters.heightInGroups );
	2376	d64BaseCost += xGetRateSigCoeffGroup(0, uiCtxSig) - rdStats.d64SigCost;;
	2377	pdCostCoeffGroupSig[ iCGScanPos ] = xGetRateSigCoeffGroup(0, uiCtxSig);
	2378	}
[608]	2379	else
	2380	{
	2381	if (iCGScanPos < iCGLastScanPos) //skip the last coefficient group, which will be handled together with last position below.
[2]	2382	{
[1313]	2383	if ( rdStats.iNNZbeforePos0 == 0 )
[56]	2384	{
[608]	2385	d64BaseCost -= rdStats.d64SigCost_0;
	2386	rdStats.d64SigCost -= rdStats.d64SigCost_0;
	2387	}
	2388	// rd-cost if SigCoeffGroupFlag = 0, initialization
	2389	Double d64CostZeroCG = d64BaseCost;
[1313]	2390
[608]	2391	// add SigCoeffGroupFlag cost to total cost
[1313]	2392	UInt uiCtxSig = getSigCoeffGroupCtxInc( uiSigCoeffGroupFlag, uiCGPosX, uiCGPosY, codingParameters.widthInGroups, codingParameters.heightInGroups );
	2393
[608]	2394	if (iCGScanPos < iCGLastScanPos)
	2395	{
[1313]	2396	d64BaseCost += xGetRateSigCoeffGroup(1, uiCtxSig);
	2397	d64CostZeroCG += xGetRateSigCoeffGroup(0, uiCtxSig);
	2398	pdCostCoeffGroupSig[ iCGScanPos ] = xGetRateSigCoeffGroup(1, uiCtxSig);
[608]	2399	}
[1313]	2400
[608]	2401	// try to convert the current coeff group from non-zero to all-zero
	2402	d64CostZeroCG += rdStats.d64UncodedDist; // distortion for resetting non-zero levels to zero levels
	2403	d64CostZeroCG -= rdStats.d64CodedLevelandDist; // distortion and level cost for keeping all non-zero levels
	2404	d64CostZeroCG -= rdStats.d64SigCost; // sig cost for all coeffs, including zero levels and non-zerl levels
[1313]	2405
[608]	2406	// if we can save cost, change this block to all-zero block
[1313]	2407	if ( d64CostZeroCG < d64BaseCost )
[608]	2408	{
	2409	uiSigCoeffGroupFlag[ uiCGBlkPos ] = 0;
	2410	d64BaseCost = d64CostZeroCG;
[56]	2411	if (iCGScanPos < iCGLastScanPos)
	2412	{
[1313]	2413	pdCostCoeffGroupSig[ iCGScanPos ] = xGetRateSigCoeffGroup(0, uiCtxSig);
[56]	2414	}
[1313]	2415	// reset coeffs to 0 in this block
[608]	2416	for (Int iScanPosinCG = uiCGSize-1; iScanPosinCG >= 0; iScanPosinCG--)
[56]	2417	{
[608]	2418	iScanPos = iCGScanPos*uiCGSize + iScanPosinCG;
[1313]	2419	UInt uiBlkPos = codingParameters.scan[ iScanPos ];
	2420
[608]	2421	if (piDstCoeff[ uiBlkPos ])
[56]	2422	{
[608]	2423	piDstCoeff [ uiBlkPos ] = 0;
	2424	pdCostCoeff[ iScanPos ] = pdCostCoeff0[ iScanPos ];
	2425	pdCostSig [ iScanPos ] = 0;
[56]	2426	}
[608]	2427	}
[1313]	2428	} // end if ( d64CostAllZeros < d64BaseCost )
[608]	2429	}
	2430	} // end if if (uiSigCoeffGroupFlag[ uiCGBlkPos ] == 0)
[2]	2431	}
[608]	2432	else
	2433	{
	2434	uiSigCoeffGroupFlag[ uiCGBlkPos ] = 1;
	2435	}
	2436	}
	2437	} //end for (iCGScanPos)
[1313]	2438
[56]	2439	//===== estimate last position =====
	2440	if ( iLastScanPos < 0 )
	2441	{
	2442	return;
	2443	}
[1313]	2444
[56]	2445	Double d64BestCost = 0;
	2446	Int ui16CtxCbf = 0;
	2447	Int iBestLastIdxP1 = 0;
[1313]	2448	if( !pcCU->isIntra( uiAbsPartIdx ) && isLuma(compID) && pcCU->getTransformIdx( uiAbsPartIdx ) == 0 )
[2]	2449	{
[56]	2450	ui16CtxCbf = 0;
	2451	d64BestCost = d64BlockUncodedCost + xGetICost( m_pcEstBitsSbac->blockRootCbpBits[ ui16CtxCbf ][ 0 ] );
	2452	d64BaseCost += xGetICost( m_pcEstBitsSbac->blockRootCbpBits[ ui16CtxCbf ][ 1 ] );
[2]	2453	}
	2454	else
	2455	{
[1313]	2456	ui16CtxCbf = pcCU->getCtxQtCbf( rTu, channelType );
	2457	ui16CtxCbf += getCBFContextOffset(compID);
[56]	2458	d64BestCost = d64BlockUncodedCost + xGetICost( m_pcEstBitsSbac->blockCbpBits[ ui16CtxCbf ][ 0 ] );
	2459	d64BaseCost += xGetICost( m_pcEstBitsSbac->blockCbpBits[ ui16CtxCbf ][ 1 ] );
[2]	2460	}
[1313]	2461
	2462
[608]	2463	Bool bFoundLast = false;
	2464	for (Int iCGScanPos = iCGLastScanPos; iCGScanPos >= 0; iCGScanPos--)
	2465	{
[1313]	2466	UInt uiCGBlkPos = codingParameters.scanCG[ iCGScanPos ];
	2467
	2468	d64BaseCost -= pdCostCoeffGroupSig [ iCGScanPos ];
[608]	2469	if (uiSigCoeffGroupFlag[ uiCGBlkPos ])
[1313]	2470	{
[608]	2471	for (Int iScanPosinCG = uiCGSize-1; iScanPosinCG >= 0; iScanPosinCG--)
	2472	{
	2473	iScanPos = iCGScanPos*uiCGSize + iScanPosinCG;
[1313]	2474
	2475	if (iScanPos > iLastScanPos)
	2476	{
	2477	continue;
	2478	}
	2479	UInt uiBlkPos = codingParameters.scan[iScanPos];
	2480
[608]	2481	if( piDstCoeff[ uiBlkPos ] )
[56]	2482	{
[1313]	2483	UInt uiPosY = uiBlkPos >> uiLog2BlockWidth;
	2484	UInt uiPosX = uiBlkPos - ( uiPosY << uiLog2BlockWidth );
	2485
	2486	Double d64CostLast= codingParameters.scanType == SCAN_VER ? xGetRateLast( uiPosY, uiPosX, compID ) : xGetRateLast( uiPosX, uiPosY, compID );
[608]	2487	Double totalCost = d64BaseCost + d64CostLast - pdCostSig[ iScanPos ];
[1313]	2488
[608]	2489	if( totalCost < d64BestCost )
[56]	2490	{
[608]	2491	iBestLastIdxP1 = iScanPos + 1;
	2492	d64BestCost = totalCost;
[56]	2493	}
[608]	2494	if( piDstCoeff[ uiBlkPos ] > 1 )
[56]	2495	{
[608]	2496	bFoundLast = true;
	2497	break;
[56]	2498	}
[608]	2499	d64BaseCost -= pdCostCoeff[ iScanPos ];
	2500	d64BaseCost += pdCostCoeff0[ iScanPos ];
	2501	}
	2502	else
[56]	2503	{
[608]	2504	d64BaseCost -= pdCostSig[ iScanPos ];
[56]	2505	}
[1313]	2506	} //end for
[608]	2507	if (bFoundLast)
	2508	{
	2509	break;
	2510	}
	2511	} // end if (uiSigCoeffGroupFlag[ uiCGBlkPos ])
[1313]	2512	} // end for
	2513
	2514
[56]	2515	for ( Int scanPos = 0; scanPos < iBestLastIdxP1; scanPos++ )
	2516	{
[1313]	2517	Int blkPos = codingParameters.scan[ scanPos ];
	2518	TCoeff level = piDstCoeff[ blkPos ];
[56]	2519	uiAbsSum += level;
	2520	piDstCoeff[ blkPos ] = ( plSrcCoeff[ blkPos ] < 0 ) ? -level : level;
[2]	2521	}
[1313]	2522
[2]	2523	//===== clean uncoded coefficients =====
[56]	2524	for ( Int scanPos = iBestLastIdxP1; scanPos <= iLastScanPos; scanPos++ )
[2]	2525	{
[1313]	2526	piDstCoeff[ codingParameters.scan[ scanPos ] ] = 0;
[56]	2527	}
[1313]	2528
	2529
[1413]	2530	if( pcCU->getSlice()->getPPS()->getSignDataHidingEnabledFlag() && uiAbsSum>=2)
[56]	2531	{
[1313]	2532	const Double inverseQuantScale = Double(g_invQuantScales[cQP.rem]);
	2533	Int64 rdFactor = (Int64)(inverseQuantScale * inverseQuantScale * (1 << (2 * cQP.per))
	2534	/ m_dLambda / 16 / (1 << (2 * DISTORTION_PRECISION_ADJUSTMENT(channelBitDepth - 8)))
	2535	+ 0.5);
	2536
[56]	2537	Int lastCG = -1;
	2538	Int absSum = 0 ;
	2539	Int n ;
[1313]	2540
	2541	for( Int subSet = (uiWidth*uiHeight-1) >> MLS_CG_SIZE; subSet >= 0; subSet-- )
[2]	2542	{
[1313]	2543	Int subPos = subSet << MLS_CG_SIZE;
	2544	Int firstNZPosInCG=uiCGSize , lastNZPosInCG=-1 ;
[56]	2545	absSum = 0 ;
[1313]	2546
	2547	for(n = uiCGSize-1; n >= 0; --n )
[56]	2548	{
[1313]	2549	if( piDstCoeff[ codingParameters.scan[ n + subPos ]] )
[56]	2550	{
	2551	lastNZPosInCG = n;
	2552	break;
	2553	}
	2554	}
[1313]	2555
	2556	for(n = 0; n <uiCGSize; n++ )
[56]	2557	{
[1313]	2558	if( piDstCoeff[ codingParameters.scan[ n + subPos ]] )
[56]	2559	{
	2560	firstNZPosInCG = n;
	2561	break;
	2562	}
	2563	}
[1313]	2564
[56]	2565	for(n = firstNZPosInCG; n <=lastNZPosInCG; n++ )
	2566	{
[1313]	2567	absSum += Int(piDstCoeff[ codingParameters.scan[ n + subPos ]]);
[56]	2568	}
[1313]	2569
[608]	2570	if(lastNZPosInCG>=0 && lastCG==-1)
[2]	2571	{
[1313]	2572	lastCG = 1;
	2573	}
	2574
[608]	2575	if( lastNZPosInCG-firstNZPosInCG>=SBH_THRESHOLD )
	2576	{
[1313]	2577	UInt signbit = (piDstCoeff[codingParameters.scan[subPos+firstNZPosInCG]]>0?0:1);
[56]	2578	if( signbit!=(absSum&0x1) ) // hide but need tune
	2579	{
[1313]	2580	// calculate the cost
	2581	Int64 minCostInc = std::numeric_limits<Int64>::max(), curCost = std::numeric_limits<Int64>::max();
	2582	Int minPos = -1, finalChange = 0, curChange = 0;
	2583
	2584	for( n = (lastCG==1?lastNZPosInCG:uiCGSize-1) ; n >= 0; --n )
[56]	2585	{
[1313]	2586	UInt uiBlkPos = codingParameters.scan[ n + subPos ];
[56]	2587	if(piDstCoeff[ uiBlkPos ] != 0 )
	2588	{
[1313]	2589	Int64 costUp = rdFactor * ( - deltaU[uiBlkPos] ) + rateIncUp[uiBlkPos];
	2590	Int64 costDown = rdFactor * ( deltaU[uiBlkPos] ) + rateIncDown[uiBlkPos]
	2591	- ((abs(piDstCoeff[uiBlkPos]) == 1) ? sigRateDelta[uiBlkPos] : 0);
	2592
[56]	2593	if(lastCG==1 && lastNZPosInCG==n && abs(piDstCoeff[uiBlkPos])==1)
	2594	{
[1313]	2595	costDown -= (4<<15);
[56]	2596	}
[1313]	2597
[56]	2598	if(costUp<costDown)
[1313]	2599	{
[56]	2600	curCost = costUp;
[1313]	2601	curChange = 1;
[56]	2602	}
[1313]	2603	else
[56]	2604	{
[1313]	2605	curChange = -1;
[56]	2606	if(n==firstNZPosInCG && abs(piDstCoeff[uiBlkPos])==1)
	2607	{
[1313]	2608	curCost = std::numeric_limits<Int64>::max();
[56]	2609	}
	2610	else
	2611	{
[1313]	2612	curCost = costDown;
[56]	2613	}
	2614	}
	2615	}
	2616	else
	2617	{
[1313]	2618	curCost = rdFactor * ( - (abs(deltaU[uiBlkPos])) ) + (1<<15) + rateIncUp[uiBlkPos] + sigRateDelta[uiBlkPos] ;
[56]	2619	curChange = 1 ;
[1313]	2620
[56]	2621	if(n<firstNZPosInCG)
	2622	{
	2623	UInt thissignbit = (plSrcCoeff[uiBlkPos]>=0?0:1);
	2624	if(thissignbit != signbit )
	2625	{
[1313]	2626	curCost = std::numeric_limits<Int64>::max();
[56]	2627	}
	2628	}
	2629	}
[1313]	2630
[56]	2631	if( curCost<minCostInc)
	2632	{
[1313]	2633	minCostInc = curCost;
	2634	finalChange = curChange;
	2635	minPos = uiBlkPos;
[56]	2636	}
	2637	}
[1313]	2638
	2639	if(piDstCoeff[minPos] == entropyCodingMaximum \|\| piDstCoeff[minPos] == entropyCodingMinimum)
[56]	2640	{
	2641	finalChange = -1;
	2642	}
[1313]	2643
[56]	2644	if(plSrcCoeff[minPos]>=0)
	2645	{
	2646	piDstCoeff[minPos] += finalChange ;
	2647	}
	2648	else
	2649	{
[1313]	2650	piDstCoeff[minPos] -= finalChange ;
	2651	}
[56]	2652	}
[2]	2653	}
[1313]	2654
[56]	2655	if(lastCG==1)
[2]	2656	{
[1313]	2657	lastCG=0 ;
[56]	2658	}
[2]	2659	}
	2660	}
	2661	}
	2662
[1313]	2663
[608]	2664	/** Pattern decision for context derivation process of significant_coeff_flag
	2665	* \param sigCoeffGroupFlag pointer to prior coded significant coeff group
[1313]	2666	* \param uiCGPosX column of current coefficient group
	2667	* \param uiCGPosY row of current coefficient group
	2668	* \param widthInGroups width of the block
	2669	* \param heightInGroups height of the block
[608]	2670	* \returns pattern for current coefficient group
	2671	*/
[1313]	2672	Int TComTrQuant::calcPatternSigCtx( const UInt* sigCoeffGroupFlag, UInt uiCGPosX, UInt uiCGPosY, UInt widthInGroups, UInt heightInGroups )
[608]	2673	{
[1313]	2674	if ((widthInGroups <= 1) && (heightInGroups <= 1))
	2675	{
	2676	return 0;
	2677	}
[608]	2678
[1313]	2679	const Bool rightAvailable = uiCGPosX < (widthInGroups - 1);
	2680	const Bool belowAvailable = uiCGPosY < (heightInGroups - 1);
	2681
[608]	2682	UInt sigRight = 0;
	2683	UInt sigLower = 0;
	2684
[1313]	2685	if (rightAvailable)
[608]	2686	{
[1313]	2687	sigRight = ((sigCoeffGroupFlag[ (uiCGPosY * widthInGroups) + uiCGPosX + 1 ] != 0) ? 1 : 0);
[608]	2688	}
[1313]	2689	if (belowAvailable)
[608]	2690	{
[1313]	2691	sigLower = ((sigCoeffGroupFlag[ (uiCGPosY + 1) * widthInGroups + uiCGPosX ] != 0) ? 1 : 0);
[608]	2692	}
[1313]	2693
	2694	return sigRight + (sigLower << 1);
[608]	2695	}
	2696
[1313]	2697
[2]	2698	/** Context derivation process of coeff_abs_significant_flag
[608]	2699	* \param patternSigCtx pattern for current coefficient group
[1313]	2700	* \param codingParameters coding parameters for the TU (includes the scan)
	2701	* \param scanPosition current position in scan order
	2702	* \param log2BlockWidth log2 width of the block
	2703	* \param log2BlockHeight log2 height of the block
	2704	* \param chanType channel type (CHANNEL_TYPE_LUMA/CHROMA)
[2]	2705	* \returns ctxInc for current scan position
	2706	*/
[1313]	2707	Int TComTrQuant::getSigCtxInc ( Int patternSigCtx,
	2708	const TUEntropyCodingParameters &codingParameters,
	2709	const Int scanPosition,
	2710	const Int log2BlockWidth,
	2711	const Int log2BlockHeight,
	2712	const ChannelType chanType)
[2]	2713	{
[1313]	2714	if (codingParameters.firstSignificanceMapContext == significanceMapContextSetStart[chanType][CONTEXT_TYPE_SINGLE])
[2]	2715	{
[1313]	2716	//single context mode
	2717	return significanceMapContextSetStart[chanType][CONTEXT_TYPE_SINGLE];
[2]	2718	}
[608]	2719
[1313]	2720	const UInt rasterPosition = codingParameters.scan[scanPosition];
	2721	const UInt posY = rasterPosition >> log2BlockWidth;
	2722	const UInt posX = rasterPosition - (posY << log2BlockWidth);
	2723
	2724	if ((posX + posY) == 0)
[2]	2725	{
[1313]	2726	return 0; //special case for the DC context variable
[2]	2727	}
[56]	2728
[1313]	2729	Int offset = MAX_INT;
[608]	2730
[1313]	2731	if ((log2BlockWidth == 2) && (log2BlockHeight == 2)) //4x4
[2]	2732	{
[1313]	2733	offset = ctxIndMap4x4[ (4 * posY) + posX ];
[2]	2734	}
[608]	2735	else
[2]	2736	{
[1313]	2737	Int cnt = 0;
	2738
	2739	switch (patternSigCtx)
	2740	{
	2741	//------------------
	2742
	2743	case 0: //neither neighbouring group is significant
	2744	{
	2745	const Int posXinSubset = posX & ((1 << MLS_CG_LOG2_WIDTH) - 1);
	2746	const Int posYinSubset = posY & ((1 << MLS_CG_LOG2_HEIGHT) - 1);
	2747	const Int posTotalInSubset = posXinSubset + posYinSubset;
	2748
	2749	//first N coefficients in scan order use 2; the next few use 1; the rest use 0.
	2750	const UInt context1Threshold = NEIGHBOURHOOD_00_CONTEXT_1_THRESHOLD_4x4;
	2751	const UInt context2Threshold = NEIGHBOURHOOD_00_CONTEXT_2_THRESHOLD_4x4;
	2752
	2753	cnt = (posTotalInSubset >= context1Threshold) ? 0 : ((posTotalInSubset >= context2Threshold) ? 1 : 2);
	2754	}
	2755	break;
	2756
	2757	//------------------
	2758
	2759	case 1: //right group is significant, below is not
	2760	{
	2761	const Int posYinSubset = posY & ((1 << MLS_CG_LOG2_HEIGHT) - 1);
	2762	const Int groupHeight = 1 << MLS_CG_LOG2_HEIGHT;
	2763
	2764	cnt = (posYinSubset >= (groupHeight >> 1)) ? 0 : ((posYinSubset >= (groupHeight >> 2)) ? 1 : 2); //top quarter uses 2; second-from-top quarter uses 1; bottom half uses 0
	2765	}
	2766	break;
	2767
	2768	//------------------
	2769
	2770	case 2: //below group is significant, right is not
	2771	{
	2772	const Int posXinSubset = posX & ((1 << MLS_CG_LOG2_WIDTH) - 1);
	2773	const Int groupWidth = 1 << MLS_CG_LOG2_WIDTH;
	2774
	2775	cnt = (posXinSubset >= (groupWidth >> 1)) ? 0 : ((posXinSubset >= (groupWidth >> 2)) ? 1 : 2); //left quarter uses 2; second-from-left quarter uses 1; right half uses 0
	2776	}
	2777	break;
	2778
	2779	//------------------
	2780
	2781	case 3: //both neighbouring groups are significant
	2782	{
	2783	cnt = 2;
	2784	}
	2785	break;
	2786
	2787	//------------------
	2788
	2789	default:
	2790	std::cerr << "ERROR: Invalid patternSigCtx \"" << Int(patternSigCtx) << "\" in getSigCtxInc" << std::endl;
	2791	exit(1);
	2792	break;
	2793	}
	2794
	2795	//------------------------------------------------
	2796
	2797	const Bool notFirstGroup = ((posX >> MLS_CG_LOG2_WIDTH) + (posY >> MLS_CG_LOG2_HEIGHT)) > 0;
	2798
	2799	offset = (notFirstGroup ? notFirstGroupNeighbourhoodContextOffset[chanType] : 0) + cnt;
[2]	2800	}
[56]	2801
[1313]	2802	return codingParameters.firstSignificanceMapContext + offset;
[2]	2803	}
	2804
[1313]	2805
[2]	2806	/** Get the best level in RD sense
[1313]	2807	*
[2]	2808	* \returns best quantized transform level for given scan position
[1313]	2809	*
[2]	2810	* This method calculates the best quantized transform level for a given scan position.
	2811	*/
[1313]	2812	__inline UInt TComTrQuant::xGetCodedLevel ( Double& rd64CodedCost, //< reference to coded cost
	2813	Double& rd64CodedCost0, //< reference to cost when coefficient is 0
	2814	Double& rd64CodedCostSig, //< rd64CodedCostSig reference to cost of significant coefficient
	2815	Intermediate_Int lLevelDouble, //< reference to unscaled quantized level
	2816	UInt uiMaxAbsLevel, //< scaled quantized level
	2817	UShort ui16CtxNumSig, //< current ctxInc for coeff_abs_significant_flag
	2818	UShort ui16CtxNumOne, //< current ctxInc for coeff_abs_level_greater1 (1st bin of coeff_abs_level_minus1 in AVC)
	2819	UShort ui16CtxNumAbs, //< current ctxInc for coeff_abs_level_greater2 (remaining bins of coeff_abs_level_minus1 in AVC)
	2820	UShort ui16AbsGoRice, //< current Rice parameter for coeff_abs_level_minus3
	2821	UInt c1Idx, //<
	2822	UInt c2Idx, //<
	2823	Int iQBits, //< quantization step size
	2824	Double errorScale, //<
	2825	Bool bLast, //< indicates if the coefficient is the last significant
	2826	Bool useLimitedPrefixLength, //<
	2827	const Int maxLog2TrDynamicRange //<
	2828	) const
[2]	2829	{
[1313]	2830	Double dCurrCostSig = 0;
[2]	2831	UInt uiBestAbsLevel = 0;
[1313]	2832
[56]	2833	if( !bLast && uiMaxAbsLevel < 3 )
[2]	2834	{
[1313]	2835	rd64CodedCostSig = xGetRateSigCoef( 0, ui16CtxNumSig );
[56]	2836	rd64CodedCost = rd64CodedCost0 + rd64CodedCostSig;
	2837	if( uiMaxAbsLevel == 0 )
	2838	{
	2839	return uiBestAbsLevel;
	2840	}
[2]	2841	}
	2842	else
	2843	{
[56]	2844	rd64CodedCost = MAX_DOUBLE;
[2]	2845	}
	2846
[56]	2847	if( !bLast )
[2]	2848	{
[56]	2849	dCurrCostSig = xGetRateSigCoef( 1, ui16CtxNumSig );
[2]	2850	}
	2851
[56]	2852	UInt uiMinAbsLevel = ( uiMaxAbsLevel > 1 ? uiMaxAbsLevel - 1 : 1 );
	2853	for( Int uiAbsLevel = uiMaxAbsLevel; uiAbsLevel >= uiMinAbsLevel ; uiAbsLevel-- )
[2]	2854	{
[1313]	2855	Double dErr = Double( lLevelDouble - ( Intermediate_Int(uiAbsLevel) << iQBits ) );
	2856	Double dCurrCost = dErr * dErr * errorScale + xGetICost( xGetICRate( uiAbsLevel, ui16CtxNumOne, ui16CtxNumAbs, ui16AbsGoRice, c1Idx, c2Idx, useLimitedPrefixLength, maxLog2TrDynamicRange ) );
[56]	2857	dCurrCost += dCurrCostSig;
[2]	2858
	2859	if( dCurrCost < rd64CodedCost )
	2860	{
[56]	2861	uiBestAbsLevel = uiAbsLevel;
	2862	rd64CodedCost = dCurrCost;
	2863	rd64CodedCostSig = dCurrCostSig;
[2]	2864	}
	2865	}
[56]	2866
[2]	2867	return uiBestAbsLevel;
	2868	}
	2869
	2870	/** Calculates the cost for specific absolute transform level
	2871	* \param uiAbsLevel scaled quantized level
	2872	* \param ui16CtxNumOne current ctxInc for coeff_abs_level_greater1 (1st bin of coeff_abs_level_minus1 in AVC)
	2873	* \param ui16CtxNumAbs current ctxInc for coeff_abs_level_greater2 (remaining bins of coeff_abs_level_minus1 in AVC)
	2874	* \param ui16AbsGoRice Rice parameter for coeff_abs_level_minus3
[1313]	2875	* \param c1Idx
	2876	* \param c2Idx
	2877	* \param useLimitedPrefixLength
	2878	* \param maxLog2TrDynamicRange
[2]	2879	* \returns cost of given absolute transform level
	2880	*/
[1313]	2881	__inline Int TComTrQuant::xGetICRate ( const UInt uiAbsLevel,
	2882	const UShort ui16CtxNumOne,
	2883	const UShort ui16CtxNumAbs,
	2884	const UShort ui16AbsGoRice,
	2885	const UInt c1Idx,
	2886	const UInt c2Idx,
	2887	const Bool useLimitedPrefixLength,
	2888	const Int maxLog2TrDynamicRange
[56]	2889	) const
[2]	2890	{
[1313]	2891	Int iRate = Int(xGetIEPRate()); // cost of sign bit
	2892	UInt baseLevel = (c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx < C2FLAG_NUMBER)) : 1;
[56]	2893
	2894	if ( uiAbsLevel >= baseLevel )
[1313]	2895	{
[608]	2896	UInt symbol = uiAbsLevel - baseLevel;
	2897	UInt length;
	2898	if (symbol < (COEF_REMAIN_BIN_REDUCTION << ui16AbsGoRice))
[2]	2899	{
[608]	2900	length = symbol>>ui16AbsGoRice;
	2901	iRate += (length+1+ui16AbsGoRice)<< 15;
[2]	2902	}
[1313]	2903	else if (useLimitedPrefixLength)
	2904	{
	2905	const UInt maximumPrefixLength = (32 - (COEF_REMAIN_BIN_REDUCTION + maxLog2TrDynamicRange));
	2906
	2907	UInt prefixLength = 0;
	2908	UInt suffix = (symbol >> ui16AbsGoRice) - COEF_REMAIN_BIN_REDUCTION;
	2909
	2910	while ((prefixLength < maximumPrefixLength) && (suffix > ((2 << prefixLength) - 2)))
	2911	{
	2912	prefixLength++;
	2913	}
	2914
	2915	const UInt suffixLength = (prefixLength == maximumPrefixLength) ? (maxLog2TrDynamicRange - ui16AbsGoRice) : (prefixLength + 1/separator/);
	2916
	2917	iRate += (COEF_REMAIN_BIN_REDUCTION + prefixLength + suffixLength + ui16AbsGoRice) << 15;
	2918	}
[608]	2919	else
	2920	{
	2921	length = ui16AbsGoRice;
	2922	symbol = symbol - ( COEF_REMAIN_BIN_REDUCTION << ui16AbsGoRice);
	2923	while (symbol >= (1<<length))
	2924	{
[1313]	2925	symbol -= (1<<(length++));
[608]	2926	}
	2927	iRate += (COEF_REMAIN_BIN_REDUCTION+length+1-ui16AbsGoRice+length)<< 15;
	2928	}
[1313]	2929
[56]	2930	if (c1Idx < C1FLAG_NUMBER)
	2931	{
	2932	iRate += m_pcEstBitsSbac->m_greaterOneBits[ ui16CtxNumOne ][ 1 ];
	2933
	2934	if (c2Idx < C2FLAG_NUMBER)
	2935	{
	2936	iRate += m_pcEstBitsSbac->m_levelAbsBits[ ui16CtxNumAbs ][ 1 ];
	2937	}
	2938	}
[2]	2939	}
[1313]	2940	else if( uiAbsLevel == 1 )
[2]	2941	{
[56]	2942	iRate += m_pcEstBitsSbac->m_greaterOneBits[ ui16CtxNumOne ][ 0 ];
[2]	2943	}
	2944	else if( uiAbsLevel == 2 )
	2945	{
[56]	2946	iRate += m_pcEstBitsSbac->m_greaterOneBits[ ui16CtxNumOne ][ 1 ];
	2947	iRate += m_pcEstBitsSbac->m_levelAbsBits[ ui16CtxNumAbs ][ 0 ];
[2]	2948	}
	2949	else
	2950	{
[872]	2951	iRate = 0;
[2]	2952	}
[1313]	2953
	2954	return iRate;
[2]	2955	}
	2956
[56]	2957	__inline Double TComTrQuant::xGetRateSigCoeffGroup ( UShort uiSignificanceCoeffGroup,
	2958	UShort ui16CtxNumSig ) const
	2959	{
	2960	return xGetICost( m_pcEstBitsSbac->significantCoeffGroupBits[ ui16CtxNumSig ][ uiSignificanceCoeffGroup ] );
	2961	}
	2962
[2]	2963	/** Calculates the cost of signaling the last significant coefficient in the block
	2964	* \param uiPosX X coordinate of the last significant coefficient
	2965	* \param uiPosY Y coordinate of the last significant coefficient
[1313]	2966	* \param component colour component ID
[2]	2967	* \returns cost of last significant coefficient
	2968	*/
[56]	2969	/*
	2970	* \param uiWidth width of the transform unit (TU)
	2971	*/
	2972	__inline Double TComTrQuant::xGetRateLast ( const UInt uiPosX,
[1313]	2973	const UInt uiPosY,
	2974	const ComponentID component ) const
[2]	2975	{
[56]	2976	UInt uiCtxX = g_uiGroupIdx[uiPosX];
	2977	UInt uiCtxY = g_uiGroupIdx[uiPosY];
[1313]	2978
	2979	Double uiCost = m_pcEstBitsSbac->lastXBits[toChannelType(component)][ uiCtxX ] + m_pcEstBitsSbac->lastYBits[toChannelType(component)][ uiCtxY ];
	2980
[56]	2981	if( uiCtxX > 3 )
	2982	{
	2983	uiCost += xGetIEPRate() * ((uiCtxX-2)>>1);
	2984	}
	2985	if( uiCtxY > 3 )
	2986	{
	2987	uiCost += xGetIEPRate() * ((uiCtxY-2)>>1);
	2988	}
	2989	return xGetICost( uiCost );
[2]	2990	}
	2991
	2992	__inline Double TComTrQuant::xGetRateSigCoef ( UShort uiSignificance,
	2993	UShort ui16CtxNumSig ) const
	2994	{
	2995	return xGetICost( m_pcEstBitsSbac->significantBits[ ui16CtxNumSig ][ uiSignificance ] );
	2996	}
	2997
	2998	/** Get the cost for a specific rate
	2999	* \param dRate rate of a bit
	3000	* \returns cost at the specific rate
	3001	*/
	3002	__inline Double TComTrQuant::xGetICost ( Double dRate ) const
	3003	{
	3004	return m_dLambda * dRate;
	3005	}
	3006
	3007	/** Get the cost of an equal probable bit
	3008	* \returns cost of equal probable bit
	3009	*/
	3010	__inline Double TComTrQuant::xGetIEPRate ( ) const
	3011	{
	3012	return 32768;
	3013	}
[56]	3014
	3015	/** Context derivation process of coeff_abs_significant_flag
	3016	* \param uiSigCoeffGroupFlag significance map of L1
[1313]	3017	* \param uiCGPosX column of current scan position
	3018	* \param uiCGPosY row of current scan position
	3019	* \param widthInGroups width of the block
	3020	* \param heightInGroups height of the block
[56]	3021	* \returns ctxInc for current scan position
	3022	*/
[1313]	3023	UInt TComTrQuant::getSigCoeffGroupCtxInc (const UInt* uiSigCoeffGroupFlag,
	3024	const UInt uiCGPosX,
	3025	const UInt uiCGPosY,
	3026	const UInt widthInGroups,
	3027	const UInt heightInGroups)
[56]	3028	{
[1313]	3029	UInt sigRight = 0;
	3030	UInt sigLower = 0;
[56]	3031
[1313]	3032	if (uiCGPosX < (widthInGroups - 1))
[56]	3033	{
[1313]	3034	sigRight = ((uiSigCoeffGroupFlag[ (uiCGPosY * widthInGroups) + uiCGPosX + 1 ] != 0) ? 1 : 0);
[56]	3035	}
[1313]	3036	if (uiCGPosY < (heightInGroups - 1))
[56]	3037	{
[1313]	3038	sigLower = ((uiSigCoeffGroupFlag[ (uiCGPosY + 1) * widthInGroups + uiCGPosX ] != 0) ? 1 : 0);
[56]	3039	}
	3040
[1313]	3041	return ((sigRight + sigLower) != 0) ? 1 : 0;
[56]	3042	}
[1313]	3043
	3044
[56]	3045	/** set quantized matrix coefficient for encode
[1313]	3046	* \param scalingList quantized matrix address
	3047	* \param format chroma format
	3048	* \param maxLog2TrDynamicRange
	3049	* \param bitDepths reference to bit depth array for all channels
[56]	3050	*/
[1313]	3051	Void TComTrQuant::setScalingList(TComScalingList *scalingList, const Int maxLog2TrDynamicRange[MAX_NUM_CHANNEL_TYPE], const BitDepths &bitDepths)
[56]	3052	{
[1313]	3053	const Int minimumQp = 0;
	3054	const Int maximumQp = SCALING_LIST_REM_NUM;
[56]	3055
[1313]	3056	for(UInt size = 0; size < SCALING_LIST_SIZE_NUM; size++)
[56]	3057	{
[1313]	3058	for(UInt list = 0; list < SCALING_LIST_NUM; list++)
[56]	3059	{
[1313]	3060	for(Int qp = minimumQp; qp < maximumQp; qp++)
[56]	3061	{
	3062	xSetScalingListEnc(scalingList,list,size,qp);
[1313]	3063	xSetScalingListDec(*scalingList,list,size,qp);
	3064	setErrScaleCoeff(list,size,qp,maxLog2TrDynamicRange, bitDepths);
[56]	3065	}
	3066	}
	3067	}
	3068	}
	3069	/** set quantized matrix coefficient for decode
[1313]	3070	* \param scalingList quantized matrix address
	3071	* \param format chroma format
[56]	3072	*/
[1313]	3073	Void TComTrQuant::setScalingListDec(const TComScalingList &scalingList)
[56]	3074	{
[1313]	3075	const Int minimumQp = 0;
	3076	const Int maximumQp = SCALING_LIST_REM_NUM;
[56]	3077
[1313]	3078	for(UInt size = 0; size < SCALING_LIST_SIZE_NUM; size++)
[56]	3079	{
[1313]	3080	for(UInt list = 0; list < SCALING_LIST_NUM; list++)
[56]	3081	{
[1313]	3082	for(Int qp = minimumQp; qp < maximumQp; qp++)
[56]	3083	{
	3084	xSetScalingListDec(scalingList,list,size,qp);
	3085	}
	3086	}
	3087	}
	3088	}
	3089	/** set error scale coefficients
[1313]	3090	* \param list list ID
	3091	* \param size
	3092	* \param qp quantization parameter
	3093	* \param maxLog2TrDynamicRange
	3094	* \param bitDepths reference to bit depth array for all channels
[56]	3095	*/
[1313]	3096	Void TComTrQuant::setErrScaleCoeff(UInt list, UInt size, Int qp, const Int maxLog2TrDynamicRange[MAX_NUM_CHANNEL_TYPE], const BitDepths &bitDepths)
[56]	3097	{
[1313]	3098	const UInt uiLog2TrSize = g_aucConvertToBit[ g_scalingListSizeX[size] ] + 2;
	3099	const ChannelType channelType = ((list == 0) \|\| (list == MAX_NUM_COMPONENT)) ? CHANNEL_TYPE_LUMA : CHANNEL_TYPE_CHROMA;
[56]	3100
[1313]	3101	const Int channelBitDepth = bitDepths.recon[channelType];
	3102	const Int iTransformShift = getTransformShift(channelBitDepth, uiLog2TrSize, maxLog2TrDynamicRange[channelType]); // Represents scaling through forward transform
[56]	3103
	3104	UInt i,uiMaxNumCoeff = g_scalingListSize[size];
	3105	Int *piQuantcoeff;
[608]	3106	Double *pdErrScale;
	3107	piQuantcoeff = getQuantCoeff(list, qp,size);
	3108	pdErrScale = getErrScaleCoeff(list, size, qp);
[56]	3109
[1313]	3110	Double dErrScale = (Double)(1<<SCALE_BITS); // Compensate for scaling of bitcount in Lagrange cost function
	3111	dErrScale = dErrScalepow(2.0,(-2.0iTransformShift)); // Compensate for scaling through forward transform
	3112
[56]	3113	for(i=0;i<uiMaxNumCoeff;i++)
	3114	{
[1313]	3115	pdErrScale[i] = dErrScale / piQuantcoeff[i] / piQuantcoeff[i] / (1 << DISTORTION_PRECISION_ADJUSTMENT(2 * (bitDepths.recon[channelType] - 8)));
[56]	3116	}
[1313]	3117
	3118	getErrScaleCoeffNoScalingList(list, size, qp) = dErrScale / g_quantScales[qp] / g_quantScales[qp] / (1 << DISTORTION_PRECISION_ADJUSTMENT(2 * (bitDepths.recon[channelType] - 8)));
[56]	3119	}
	3120
	3121	/** set quantized matrix coefficient for encode
[1313]	3122	* \param scalingList quantized matrix address
[56]	3123	* \param listId List index
	3124	* \param sizeId size index
[1313]	3125	* \param qp Quantization parameter
	3126	* \param format chroma format
[56]	3127	*/
[1313]	3128	Void TComTrQuant::xSetScalingListEnc(TComScalingList *scalingList, UInt listId, UInt sizeId, Int qp)
[56]	3129	{
[1313]	3130	UInt width = g_scalingListSizeX[sizeId];
[56]	3131	UInt height = g_scalingListSizeX[sizeId];
[1313]	3132	UInt ratio = g_scalingListSizeX[sizeId]/min(MAX_MATRIX_SIZE_NUM,(Int)g_scalingListSizeX[sizeId]);
[56]	3133	Int *quantcoeff;
[1313]	3134	Int *coeff = scalingList->getScalingListAddress(sizeId,listId);
	3135	quantcoeff = getQuantCoeff(listId, qp, sizeId);
[56]	3136
[1313]	3137	Int quantScales = g_quantScales[qp];
	3138
	3139	processScalingListEnc(coeff,
	3140	quantcoeff,
	3141	(quantScales << LOG2_SCALING_LIST_NEUTRAL_VALUE),
	3142	height, width, ratio,
	3143	min(MAX_MATRIX_SIZE_NUM, (Int)g_scalingListSizeX[sizeId]),
	3144	scalingList->getScalingListDC(sizeId,listId));
[56]	3145	}
[1313]	3146
[56]	3147	/** set quantized matrix coefficient for decode
	3148	* \param scalingList quantaized matrix address
[1313]	3149	* \param listId List index
	3150	* \param sizeId size index
	3151	* \param qp Quantization parameter
	3152	* \param format chroma format
[56]	3153	*/
[1313]	3154	Void TComTrQuant::xSetScalingListDec(const TComScalingList &scalingList, UInt listId, UInt sizeId, Int qp)
[56]	3155	{
[1313]	3156	UInt width = g_scalingListSizeX[sizeId];
[56]	3157	UInt height = g_scalingListSizeX[sizeId];
[1313]	3158	UInt ratio = g_scalingListSizeX[sizeId]/min(MAX_MATRIX_SIZE_NUM,(Int)g_scalingListSizeX[sizeId]);
[56]	3159	Int *dequantcoeff;
[1313]	3160	const Int *coeff = scalingList.getScalingListAddress(sizeId,listId);
[56]	3161
[608]	3162	dequantcoeff = getDequantCoeff(listId, qp, sizeId);
[1313]	3163
	3164	Int invQuantScale = g_invQuantScales[qp];
	3165
	3166	processScalingListDec(coeff,
	3167	dequantcoeff,
	3168	invQuantScale,
	3169	height, width, ratio,
	3170	min(MAX_MATRIX_SIZE_NUM, (Int)g_scalingListSizeX[sizeId]),
	3171	scalingList.getScalingListDC(sizeId,listId));
[56]	3172	}
	3173
	3174	/** set flat matrix value to quantized coefficient
	3175	*/
[1313]	3176	Void TComTrQuant::setFlatScalingList(const Int maxLog2TrDynamicRange[MAX_NUM_CHANNEL_TYPE], const BitDepths &bitDepths)
[56]	3177	{
[1313]	3178	const Int minimumQp = 0;
	3179	const Int maximumQp = SCALING_LIST_REM_NUM;
[56]	3180
[1313]	3181	for(UInt size = 0; size < SCALING_LIST_SIZE_NUM; size++)
[56]	3182	{
[1313]	3183	for(UInt list = 0; list < SCALING_LIST_NUM; list++)
[56]	3184	{
[1313]	3185	for(Int qp = minimumQp; qp < maximumQp; qp++)
[56]	3186	{
	3187	xsetFlatScalingList(list,size,qp);
[1313]	3188	setErrScaleCoeff(list,size,qp,maxLog2TrDynamicRange, bitDepths);
[56]	3189	}
	3190	}
	3191	}
	3192	}
	3193
	3194	/** set flat matrix value to quantized coefficient
	3195	* \param list List ID
[1313]	3196	* \param size size index
	3197	* \param qp Quantization parameter
	3198	* \param format chroma format
[56]	3199	*/
[1313]	3200	Void TComTrQuant::xsetFlatScalingList(UInt list, UInt size, Int qp)
[56]	3201	{
	3202	UInt i,num = g_scalingListSize[size];
	3203	Int *quantcoeff;
	3204	Int *dequantcoeff;
	3205
[1313]	3206	Int quantScales = g_quantScales [qp];
	3207	Int invQuantScales = g_invQuantScales[qp] << 4;
	3208
[608]	3209	quantcoeff = getQuantCoeff(list, qp, size);
	3210	dequantcoeff = getDequantCoeff(list, qp, size);
[56]	3211
	3212	for(i=0;i<num;i++)
[1313]	3213	{
[56]	3214	*quantcoeff++ = quantScales;
	3215	*dequantcoeff++ = invQuantScales;
	3216	}
	3217	}
	3218
	3219	/** set quantized matrix coefficient for encode
	3220	* \param coeff quantaized matrix address
	3221	* \param quantcoeff quantaized matrix address
	3222	* \param quantScales Q(QP%6)
	3223	* \param height height
	3224	* \param width width
	3225	* \param ratio ratio for upscale
	3226	* \param sizuNum matrix size
	3227	* \param dc dc parameter
	3228	*/
	3229	Void TComTrQuant::processScalingListEnc( Int coeff, Int quantcoeff, Int quantScales, UInt height, UInt width, UInt ratio, Int sizuNum, UInt dc)
	3230	{
	3231	for(UInt j=0;j<height;j++)
	3232	{
	3233	for(UInt i=0;i<width;i++)
	3234	{
[1313]	3235	quantcoeff[jwidth + i] = quantScales / coeff[sizuNum (j / ratio) + i / ratio];
[56]	3236	}
	3237	}
[1313]	3238
[56]	3239	if(ratio > 1)
	3240	{
	3241	quantcoeff[0] = quantScales / dc;
	3242	}
	3243	}
[1313]	3244
[56]	3245	/** set quantized matrix coefficient for decode
	3246	* \param coeff quantaized matrix address
	3247	* \param dequantcoeff quantaized matrix address
	3248	* \param invQuantScales IQ(QP%6))
	3249	* \param height height
	3250	* \param width width
	3251	* \param ratio ratio for upscale
	3252	* \param sizuNum matrix size
	3253	* \param dc dc parameter
	3254	*/
[1313]	3255	Void TComTrQuant::processScalingListDec( const Int coeff, Int dequantcoeff, Int invQuantScales, UInt height, UInt width, UInt ratio, Int sizuNum, UInt dc)
[56]	3256	{
	3257	for(UInt j=0;j<height;j++)
	3258	{
	3259	for(UInt i=0;i<width;i++)
	3260	{
[608]	3261	dequantcoeff[jwidth + i] = invQuantScales coeff[sizuNum * (j / ratio) + i / ratio];
[56]	3262	}
	3263	}
[1313]	3264
[56]	3265	if(ratio > 1)
	3266	{
	3267	dequantcoeff[0] = invQuantScales * dc;
	3268	}
	3269	}
	3270
	3271	/** initialization process of scaling list array
	3272	*/
	3273	Void TComTrQuant::initScalingList()
	3274	{
	3275	for(UInt sizeId = 0; sizeId < SCALING_LIST_SIZE_NUM; sizeId++)
	3276	{
[1313]	3277	for(UInt qp = 0; qp < SCALING_LIST_REM_NUM; qp++)
[56]	3278	{
[1313]	3279	for(UInt listId = 0; listId < SCALING_LIST_NUM; listId++)
[56]	3280	{
[1313]	3281	m_quantCoef [sizeId][listId][qp] = new Int [g_scalingListSize[sizeId]];
	3282	m_dequantCoef [sizeId][listId][qp] = new Int [g_scalingListSize[sizeId]];
[608]	3283	m_errScale [sizeId][listId][qp] = new Double [g_scalingListSize[sizeId]];
[1313]	3284	} // listID loop
[56]	3285	}
	3286	}
	3287	}
[1313]	3288
[56]	3289	/** destroy quantization matrix array
	3290	*/
	3291	Void TComTrQuant::destroyScalingList()
	3292	{
	3293	for(UInt sizeId = 0; sizeId < SCALING_LIST_SIZE_NUM; sizeId++)
	3294	{
[1313]	3295	for(UInt listId = 0; listId < SCALING_LIST_NUM; listId++)
[56]	3296	{
	3297	for(UInt qp = 0; qp < SCALING_LIST_REM_NUM; qp++)
	3298	{
[1313]	3299	if(m_quantCoef[sizeId][listId][qp])
	3300	{
	3301	delete [] m_quantCoef[sizeId][listId][qp];
	3302	}
	3303	if(m_dequantCoef[sizeId][listId][qp])
	3304	{
	3305	delete [] m_dequantCoef[sizeId][listId][qp];
	3306	}
	3307	if(m_errScale[sizeId][listId][qp])
	3308	{
	3309	delete [] m_errScale[sizeId][listId][qp];
	3310	}
[56]	3311	}
	3312	}
	3313	}
	3314	}
	3315
[1313]	3316	Void TComTrQuant::transformSkipQuantOneSample(TComTU &rTu, const ComponentID compID, const TCoeff resiDiff, TCoeff* pcCoeff, const UInt uiPos, const QpParam &cQP, const Bool bUseHalfRoundingPoint)
	3317	{
	3318	TComDataCU *pcCU = rTu.getCU();
	3319	const UInt uiAbsPartIdx = rTu.GetAbsPartIdxTU();
	3320	const TComRectangle &rect = rTu.getRect(compID);
	3321	const UInt uiWidth = rect.width;
	3322	const UInt uiHeight = rect.height;
	3323	const Int maxLog2TrDynamicRange = pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID));
	3324	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
	3325	const Int iTransformShift = getTransformShift(channelBitDepth, rTu.GetEquivalentLog2TrSize(compID), maxLog2TrDynamicRange);
	3326	const Int scalingListType = getScalingListType(pcCU->getPredictionMode(uiAbsPartIdx), compID);
	3327	const Bool enableScalingLists = getUseScalingList(uiWidth, uiHeight, true);
	3328	const Int defaultQuantisationCoefficient = g_quantScales[cQP.rem];
	3329
	3330	assert( scalingListType < SCALING_LIST_NUM );
	3331	const Int *const piQuantCoeff = getQuantCoeff( scalingListType, cQP.rem, (rTu.GetEquivalentLog2TrSize(compID)-2) );
	3332
	3333
	3334	/* for 422 chroma blocks, the effective scaling applied during transformation is not a power of 2, hence it cannot be
	3335	* implemented as a bit-shift (the quantised result will be sqrt(2) * larger than required). Alternatively, adjust the
	3336	* uiLog2TrSize applied in iTransformShift, such that the result is 1/sqrt(2) the required result (i.e. smaller)
	3337	* Then a QP+3 (sqrt(2)) or QP-3 (1/sqrt(2)) method could be used to get the required result
	3338	*/
	3339
	3340	const Int iQBits = QUANT_SHIFT + cQP.per + iTransformShift;
	3341	// QBits will be OK for any internal bit depth as the reduction in transform shift is balanced by an increase in Qp_per due to QpBDOffset
	3342
	3343	const Int iAdd = ( bUseHalfRoundingPoint ? 256 : (pcCU->getSlice()->getSliceType() == I_SLICE ? 171 : 85) ) << (iQBits - 9);
	3344
	3345	TCoeff transformedCoefficient;
	3346
	3347	// transform-skip
	3348	if (iTransformShift >= 0)
	3349	{
	3350	transformedCoefficient = resiDiff << iTransformShift;
	3351	}
	3352	else // for very high bit depths
	3353	{
	3354	const Int iTrShiftNeg = -iTransformShift;
	3355	const Int offset = 1 << (iTrShiftNeg - 1);
	3356	transformedCoefficient = ( resiDiff + offset ) >> iTrShiftNeg;
	3357	}
	3358
	3359	// quantization
	3360	const TCoeff iSign = (transformedCoefficient < 0 ? -1: 1);
	3361
	3362	const Int quantisationCoefficient = enableScalingLists ? piQuantCoeff[uiPos] : defaultQuantisationCoefficient;
	3363
	3364	const Int64 tmpLevel = (Int64)abs(transformedCoefficient) * quantisationCoefficient;
	3365
	3366	const TCoeff quantisedCoefficient = (TCoeff((tmpLevel + iAdd ) >> iQBits)) * iSign;
	3367
	3368	const TCoeff entropyCodingMinimum = -(1 << maxLog2TrDynamicRange);
	3369	const TCoeff entropyCodingMaximum = (1 << maxLog2TrDynamicRange) - 1;
	3370	pcCoeff[ uiPos ] = Clip3<TCoeff>( entropyCodingMinimum, entropyCodingMaximum, quantisedCoefficient );
	3371	}
	3372
	3373
	3374	Void TComTrQuant::invTrSkipDeQuantOneSample( TComTU &rTu, ComponentID compID, TCoeff inSample, Pel &reconSample, const QpParam &cQP, UInt uiPos )
	3375	{
	3376	TComDataCU *pcCU = rTu.getCU();
	3377	const UInt uiAbsPartIdx = rTu.GetAbsPartIdxTU();
	3378	const TComRectangle &rect = rTu.getRect(compID);
	3379	const UInt uiWidth = rect.width;
	3380	const UInt uiHeight = rect.height;
	3381	const Int QP_per = cQP.per;
	3382	const Int QP_rem = cQP.rem;
	3383	const Int maxLog2TrDynamicRange = pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID));
	3384	#if O0043_BEST_EFFORT_DECODING
	3385	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getStreamBitDepth(toChannelType(compID));
	3386	#else
	3387	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
	3388	#endif
	3389	const Int iTransformShift = getTransformShift(channelBitDepth, rTu.GetEquivalentLog2TrSize(compID), maxLog2TrDynamicRange);
	3390	const Int scalingListType = getScalingListType(pcCU->getPredictionMode(uiAbsPartIdx), compID);
	3391	const Bool enableScalingLists = getUseScalingList(uiWidth, uiHeight, true);
	3392	const UInt uiLog2TrSize = rTu.GetEquivalentLog2TrSize(compID);
	3393
	3394	assert( scalingListType < SCALING_LIST_NUM );
	3395
	3396	const Int rightShift = (IQUANT_SHIFT - (iTransformShift + QP_per)) + (enableScalingLists ? LOG2_SCALING_LIST_NEUTRAL_VALUE : 0);
	3397
	3398	const TCoeff transformMinimum = -(1 << maxLog2TrDynamicRange);
	3399	const TCoeff transformMaximum = (1 << maxLog2TrDynamicRange) - 1;
	3400
	3401	// Dequantisation
	3402
	3403	TCoeff dequantisedSample;
	3404
	3405	if(enableScalingLists)
	3406	{
	3407	const UInt dequantCoefBits = 1 + IQUANT_SHIFT + SCALING_LIST_BITS;
	3408	const UInt targetInputBitDepth = std::min<UInt>((maxLog2TrDynamicRange + 1), (((sizeof(Intermediate_Int) * 8) + rightShift) - dequantCoefBits));
	3409
	3410	const Intermediate_Int inputMinimum = -(1 << (targetInputBitDepth - 1));
	3411	const Intermediate_Int inputMaximum = (1 << (targetInputBitDepth - 1)) - 1;
	3412
	3413	Int *piDequantCoef = getDequantCoeff(scalingListType,QP_rem,uiLog2TrSize-2);
	3414
	3415	if(rightShift > 0)
	3416	{
	3417	const Intermediate_Int iAdd = 1 << (rightShift - 1);
	3418	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, inSample));
	3419	const Intermediate_Int iCoeffQ = ((Intermediate_Int(clipQCoef) * piDequantCoef[uiPos]) + iAdd ) >> rightShift;
	3420
	3421	dequantisedSample = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
	3422	}
	3423	else
	3424	{
	3425	const Int leftShift = -rightShift;
	3426	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, inSample));
	3427	const Intermediate_Int iCoeffQ = (Intermediate_Int(clipQCoef) * piDequantCoef[uiPos]) << leftShift;
	3428
	3429	dequantisedSample = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
	3430	}
	3431	}
	3432	else
	3433	{
	3434	const Int scale = g_invQuantScales[QP_rem];
	3435	const Int scaleBits = (IQUANT_SHIFT + 1) ;
	3436
	3437	const UInt targetInputBitDepth = std::min<UInt>((maxLog2TrDynamicRange + 1), (((sizeof(Intermediate_Int) * 8) + rightShift) - scaleBits));
	3438	const Intermediate_Int inputMinimum = -(1 << (targetInputBitDepth - 1));
	3439	const Intermediate_Int inputMaximum = (1 << (targetInputBitDepth - 1)) - 1;
	3440
	3441	if (rightShift > 0)
	3442	{
	3443	const Intermediate_Int iAdd = 1 << (rightShift - 1);
	3444	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, inSample));
	3445	const Intermediate_Int iCoeffQ = (Intermediate_Int(clipQCoef) * scale + iAdd) >> rightShift;
	3446
	3447	dequantisedSample = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
	3448	}
	3449	else
	3450	{
	3451	const Int leftShift = -rightShift;
	3452	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, inSample));
	3453	const Intermediate_Int iCoeffQ = (Intermediate_Int(clipQCoef) * scale) << leftShift;
	3454
	3455	dequantisedSample = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
	3456	}
	3457	}
	3458
	3459	// Inverse transform-skip
	3460
	3461	if (iTransformShift >= 0)
	3462	{
	3463	const TCoeff offset = iTransformShift==0 ? 0 : (1 << (iTransformShift - 1));
	3464	reconSample = Pel(( dequantisedSample + offset ) >> iTransformShift);
	3465	}
	3466	else //for very high bit depths
	3467	{
	3468	const Int iTrShiftNeg = -iTransformShift;
	3469	reconSample = Pel(dequantisedSample << iTrShiftNeg);
	3470	}
	3471	}
	3472
	3473
	3474	Void TComTrQuant::crossComponentPrediction( TComTU & rTu,
	3475	const ComponentID compID,
	3476	const Pel * piResiL,
	3477	const Pel * piResiC,
	3478	Pel * piResiT,
	3479	const Int width,
	3480	const Int height,
	3481	const Int strideL,
	3482	const Int strideC,
	3483	const Int strideT,
	3484	const Bool reverse )
	3485	{
	3486	const Pel *pResiL = piResiL;
	3487	const Pel *pResiC = piResiC;
	3488	Pel *pResiT = piResiT;
	3489
	3490	TComDataCU *pCU = rTu.getCU();
	3491	const Int alpha = pCU->getCrossComponentPredictionAlpha( rTu.GetAbsPartIdxTU( compID ), compID );
	3492	const Int diffBitDepth = pCU->getSlice()->getSPS()->getDifferentialLumaChromaBitDepth();
	3493
	3494	for( Int y = 0; y < height; y++ )
	3495	{
	3496	if (reverse)
	3497	{
	3498	// A constraint is to be added to the HEVC Standard to limit the size of pResiL and pResiC at this point.
	3499	// The likely form of the constraint is to either restrict the values to CoeffMin to CoeffMax,
	3500	// or to be representable in a bitDepthY+4 or bitDepthC+4 signed integer.
	3501	// The result of the constraint is that for 8/10/12bit profiles, the input values
	3502	// can be represented within a 16-bit Pel-type.
	3503	#if RExt__HIGH_BIT_DEPTH_SUPPORT
	3504	for( Int x = 0; x < width; x++ )
	3505	{
	3506	pResiT[x] = pResiC[x] + (( alpha * rightShift( pResiL[x], diffBitDepth) ) >> 3);
	3507	}
	3508	#else
	3509	const Int minPel=std::numeric_limits<Pel>::min();
	3510	const Int maxPel=std::numeric_limits<Pel>::max();
	3511	for( Int x = 0; x < width; x++ )
	3512	{
	3513	pResiT[x] = Clip3<Int>(minPel, maxPel, pResiC[x] + (( alpha * rightShift<Int>(Int(pResiL[x]), diffBitDepth) ) >> 3));
	3514	}
	3515	#endif
	3516	}
	3517	else
	3518	{
	3519	// Forward does not need clipping. Pel type should always be big enough.
	3520	for( Int x = 0; x < width; x++ )
	3521	{
	3522	pResiT[x] = pResiC[x] - (( alpha * rightShift<Int>(Int(pResiL[x]), diffBitDepth) ) >> 3);
	3523	}
	3524	}
	3525
	3526	pResiL += strideL;
	3527	pResiC += strideC;
	3528	pResiT += strideT;
	3529	}
	3530	}
	3531
[56]	3532	//! \}

Note: See TracBrowser for help on using the repository browser.

JCT-3V 3D-HEVC

Context navigation

source: 3DVCSoftware/trunk/source/Lib/TLibCommon/TComTrQuant.cpp @ 1417

Download in other formats: