Context navigation

source: 3DVCSoftware/trunk/source/Lib/TLibCommon/TComTrQuant.cpp @ 1327

Visit:

Last change on this file since 1327 was 1313, checked in by tech, 9 years ago
Merged 14.1-update-dev1@1312.
Property svn:eol-style set to `native`
File size: 130.4 KB

Rev	Line
[5]	1	/* The copyright in this software is being made available under the BSD
	2	* License, included below. This software may be subject to other third party
	3	* and contributor rights, including patent rights, and no such rights are
[1313]	4	* granted under this license.
[5]	5	*
[1313]	6	* Copyright (c) 2010-2015, ITU/ISO/IEC
[5]	7	* All rights reserved.
	8	*
	9	* Redistribution and use in source and binary forms, with or without
	10	* modification, are permitted provided that the following conditions are met:
	11	*
	12	* * Redistributions of source code must retain the above copyright notice,
	13	* this list of conditions and the following disclaimer.
	14	* * Redistributions in binary form must reproduce the above copyright notice,
	15	* this list of conditions and the following disclaimer in the documentation
	16	* and/or other materials provided with the distribution.
[56]	17	* * Neither the name of the ITU/ISO/IEC nor the names of its contributors may
[5]	18	* be used to endorse or promote products derived from this software without
	19	* specific prior written permission.
	20	*
	21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
	22	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
	23	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
	24	* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
	25	* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
	26	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
	27	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
	28	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
	29	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
	30	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
	31	* THE POSSIBILITY OF SUCH DAMAGE.
	32	*/
[2]	33
	34	/** \file TComTrQuant.cpp
	35	\brief transform and quantization class
	36	*/
	37
	38	#include <stdlib.h>
	39	#include <math.h>
[1313]	40	#include <limits>
[2]	41	#include <memory.h>
	42	#include "TComTrQuant.h"
	43	#include "TComPic.h"
	44	#include "ContextTables.h"
[1313]	45	#include "TComTU.h"
	46	#include "Debug.h"
[2]	47
[56]	48	typedef struct
	49	{
	50	Int iNNZbeforePos0;
	51	Double d64CodedLevelandDist; // distortion and level cost only
	52	Double d64UncodedDist; // all zero coded block distortion
	53	Double d64SigCost;
	54	Double d64SigCost_0;
	55	} coeffGroupRDStats;
	56
	57	//! \ingroup TLibCommon
	58	//! \{
	59
[2]	60	// ====================================================================================================================
	61	// Constants
	62	// ====================================================================================================================
	63
	64	#define RDOQ_CHROMA 1 ///< use of RDOQ in chroma
	65
[1313]	66
[2]	67	// ====================================================================================================================
[1313]	68	// QpParam constructor
[2]	69	// ====================================================================================================================
	70
[1313]	71	QpParam::QpParam(const Int qpy,
	72	const ChannelType chType,
	73	const Int qpBdOffset,
	74	const Int chromaQPOffset,
	75	const ChromaFormat chFmt )
	76	{
	77	Int baseQp;
[2]	78
[1313]	79	if(isLuma(chType))
	80	{
	81	baseQp = qpy + qpBdOffset;
	82	}
	83	else
	84	{
	85	baseQp = Clip3( -qpBdOffset, (chromaQPMappingTableSize - 1), qpy + chromaQPOffset );
[2]	86
[1313]	87	if(baseQp < 0)
	88	{
	89	baseQp = baseQp + qpBdOffset;
	90	}
	91	else
	92	{
	93	baseQp = getScaledChromaQP(baseQp, chFmt) + qpBdOffset;
	94	}
	95	}
	96
	97	Qp =baseQp;
	98	per=baseQp/6;
	99	rem=baseQp%6;
	100	}
	101
	102	QpParam::QpParam(const TComDataCU &cu, const ComponentID compID)
[2]	103	{
[1313]	104	Int chromaQpOffset = 0;
	105
	106	if (isChroma(compID))
	107	{
	108	chromaQpOffset += cu.getSlice()->getPPS()->getQpOffset(compID);
	109	chromaQpOffset += cu.getSlice()->getSliceChromaQpDelta(compID);
	110
	111	chromaQpOffset += cu.getSlice()->getPPS()->getPpsRangeExtension().getChromaQpOffsetListEntry(cu.getChromaQpAdj(0)).u.offset[Int(compID)-1];
	112	}
	113
	114	*this = QpParam(cu.getQP( 0 ),
	115	toChannelType(compID),
	116	cu.getSlice()->getSPS()->getQpBDOffset(toChannelType(compID)),
	117	chromaQpOffset,
	118	cu.getPic()->getChromaFormat());
[2]	119	}
	120
[1313]	121
[2]	122	// ====================================================================================================================
	123	// TComTrQuant class member functions
	124	// ====================================================================================================================
	125
	126	TComTrQuant::TComTrQuant()
	127	{
	128	// allocate temporary buffers
[1313]	129	m_plTempCoeff = new TCoeff[ MAX_CU_SIZE*MAX_CU_SIZE ];
	130
[2]	131	// allocate bit estimation class (for RDOQ)
	132	m_pcEstBitsSbac = new estBitsSbacStruct;
[56]	133	initScalingList();
[2]	134	}
	135
	136	TComTrQuant::~TComTrQuant()
	137	{
	138	// delete temporary buffers
	139	if ( m_plTempCoeff )
	140	{
	141	delete [] m_plTempCoeff;
	142	m_plTempCoeff = NULL;
	143	}
[1313]	144
[2]	145	// delete bit estimation class
[56]	146	if ( m_pcEstBitsSbac )
	147	{
	148	delete m_pcEstBitsSbac;
	149	}
	150	destroyScalingList();
[2]	151	}
	152
[56]	153	#if ADAPTIVE_QP_SELECTION
	154	Void TComTrQuant::storeSliceQpNext(TComSlice* pcSlice)
	155	{
[1313]	156	// NOTE: does this work with negative QPs or when some blocks are transquant-bypass enabled?
	157
[56]	158	Int qpBase = pcSlice->getSliceQpBase();
	159	Int sliceQpused = pcSlice->getSliceQp();
	160	Int sliceQpnext;
	161	Double alpha = qpBase < 17 ? 0.5 : 1;
[1313]	162
[56]	163	Int cnt=0;
[608]	164	for(Int u=1; u<=LEVEL_RANGE; u++)
[1313]	165	{
[56]	166	cnt += m_sliceNsamples[u] ;
	167	}
	168
[608]	169	if( !m_useRDOQ )
[56]	170	{
	171	sliceQpused = qpBase;
	172	alpha = 0.5;
	173	}
	174
	175	if( cnt > 120 )
	176	{
	177	Double sum = 0;
	178	Int k = 0;
	179	for(Int u=1; u<LEVEL_RANGE; u++)
	180	{
	181	sum += u*m_sliceSumC[u];
	182	k += uum_sliceNsamples[u];
	183	}
	184
	185	Int v;
	186	Double q[MAX_QP+1] ;
	187	for(v=0; v<=MAX_QP; v++)
	188	{
	189	q[v] = (Double)(g_invQuantScales[v%6] * (1<<(v/6)))/64 ;
	190	}
	191
	192	Double qnext = sum/k * q[sliceQpused] / (1<<ARL_C_PRECISION);
	193
	194	for(v=0; v<MAX_QP; v++)
	195	{
	196	if(qnext < alpha * q[v] + (1 - alpha) * q[v+1] )
	197	{
	198	break;
	199	}
	200	}
	201	sliceQpnext = Clip3(sliceQpused - 3, sliceQpused + 3, v);
	202	}
	203	else
	204	{
	205	sliceQpnext = sliceQpused;
	206	}
	207
[1313]	208	m_qpDelta[qpBase] = sliceQpnext - qpBase;
[56]	209	}
	210
	211	Void TComTrQuant::initSliceQpDelta()
	212	{
	213	for(Int qp=0; qp<=MAX_QP; qp++)
	214	{
	215	m_qpDelta[qp] = qp < 17 ? 0 : 1;
	216	}
	217	}
	218
	219	Void TComTrQuant::clearSliceARLCnt()
[1313]	220	{
[56]	221	memset(m_sliceSumC, 0, sizeof(Double)*(LEVEL_RANGE+1));
	222	memset(m_sliceNsamples, 0, sizeof(Int)*(LEVEL_RANGE+1));
	223	}
	224	#endif
	225
	226
	227
[2]	228	#if MATRIX_MULT
	229	/** NxN forward transform (2D) using brute force matrix multiplication (3 nested loops)
	230	* \param block pointer to input data (residual)
	231	* \param coeff pointer to output data (transform coefficients)
	232	* \param uiStride stride of input data
	233	* \param uiTrSize transform size (uiTrSize x uiTrSize)
	234	* \param uiMode is Intra Prediction mode used in Mode-Dependent DCT/DST only
	235	*/
[1313]	236	Void xTr(Int bitDepth, Pel block, TCoeff coeff, UInt uiStride, UInt uiTrSize, Bool useDST, const Int maxLog2TrDynamicRange)
[2]	237	{
[1313]	238	UInt i,j,k;
	239	TCoeff iSum;
	240	TCoeff tmp[MAX_TU_SIZE * MAX_TU_SIZE];
	241	const TMatrixCoeff *iT;
[2]	242	UInt uiLog2TrSize = g_aucConvertToBit[ uiTrSize ] + 2;
	243
	244	if (uiTrSize==4)
	245	{
[1313]	246	iT = (useDST ? g_as_DST_MAT_4[TRANSFORM_FORWARD][0] : g_aiT4[TRANSFORM_FORWARD][0]);
[2]	247	}
	248	else if (uiTrSize==8)
	249	{
[1313]	250	iT = g_aiT8[TRANSFORM_FORWARD][0];
[2]	251	}
	252	else if (uiTrSize==16)
	253	{
[1313]	254	iT = g_aiT16[TRANSFORM_FORWARD][0];
[2]	255	}
	256	else if (uiTrSize==32)
	257	{
[1313]	258	iT = g_aiT32[TRANSFORM_FORWARD][0];
[2]	259	}
[56]	260	else
	261	{
[2]	262	assert(0);
	263	}
	264
[1313]	265	const Int TRANSFORM_MATRIX_SHIFT = g_transformMatrixShift[TRANSFORM_FORWARD];
[2]	266
[1313]	267	const Int shift_1st = (uiLog2TrSize + bitDepth + TRANSFORM_MATRIX_SHIFT) - maxLog2TrDynamicRange;
	268	const Int shift_2nd = uiLog2TrSize + TRANSFORM_MATRIX_SHIFT;
	269	const Int add_1st = (shift_1st>0) ? (1<<(shift_1st-1)) : 0;
	270	const Int add_2nd = 1<<(shift_2nd-1);
	271
[2]	272	/* Horizontal transform */
	273
	274	for (i=0; i<uiTrSize; i++)
	275	{
	276	for (j=0; j<uiTrSize; j++)
	277	{
	278	iSum = 0;
	279	for (k=0; k<uiTrSize; k++)
	280	{
	281	iSum += iT[iuiTrSize+k]block[j*uiStride+k];
	282	}
	283	tmp[i*uiTrSize+j] = (iSum + add_1st)>>shift_1st;
	284	}
	285	}
[1313]	286
[56]	287	/* Vertical transform */
[1313]	288	for (i=0; i<uiTrSize; i++)
[2]	289	{
	290	for (j=0; j<uiTrSize; j++)
	291	{
	292	iSum = 0;
	293	for (k=0; k<uiTrSize; k++)
	294	{
[1313]	295	iSum += iT[iuiTrSize+k]tmp[j*uiTrSize+k];
[2]	296	}
[1313]	297	coeff[i*uiTrSize+j] = (iSum + add_2nd)>>shift_2nd;
[2]	298	}
[56]	299	}
[2]	300	}
	301
	302	/** NxN inverse transform (2D) using brute force matrix multiplication (3 nested loops)
	303	* \param coeff pointer to input data (transform coefficients)
	304	* \param block pointer to output data (residual)
	305	* \param uiStride stride of output data
	306	* \param uiTrSize transform size (uiTrSize x uiTrSize)
	307	* \param uiMode is Intra Prediction mode used in Mode-Dependent DCT/DST only
	308	*/
[1313]	309	Void xITr(Int bitDepth, TCoeff coeff, Pel block, UInt uiStride, UInt uiTrSize, Bool useDST, const Int maxLog2TrDynamicRange)
[2]	310	{
[1313]	311	UInt i,j,k;
	312	TCoeff iSum;
	313	TCoeff tmp[MAX_TU_SIZE * MAX_TU_SIZE];
	314	const TMatrixCoeff *iT;
	315
[2]	316	if (uiTrSize==4)
	317	{
[1313]	318	iT = (useDST ? g_as_DST_MAT_4[TRANSFORM_INVERSE][0] : g_aiT4[TRANSFORM_INVERSE][0]);
[2]	319	}
	320	else if (uiTrSize==8)
	321	{
[1313]	322	iT = g_aiT8[TRANSFORM_INVERSE][0];
[2]	323	}
	324	else if (uiTrSize==16)
	325	{
[1313]	326	iT = g_aiT16[TRANSFORM_INVERSE][0];
[2]	327	}
	328	else if (uiTrSize==32)
	329	{
[1313]	330	iT = g_aiT32[TRANSFORM_INVERSE][0];
[2]	331	}
[56]	332	else
	333	{
[2]	334	assert(0);
	335	}
[1313]	336
	337	const Int TRANSFORM_MATRIX_SHIFT = g_transformMatrixShift[TRANSFORM_INVERSE];
	338
	339	const Int shift_1st = TRANSFORM_MATRIX_SHIFT + 1; //1 has been added to shift_1st at the expense of shift_2nd
	340	const Int shift_2nd = (TRANSFORM_MATRIX_SHIFT + maxLog2TrDynamicRange - 1) - bitDepth;
	341	const TCoeff clipMinimum = -(1 << maxLog2TrDynamicRange);
	342	const TCoeff clipMaximum = (1 << maxLog2TrDynamicRange) - 1;
	343	assert(shift_2nd>=0);
	344	const Int add_1st = 1<<(shift_1st-1);
	345	const Int add_2nd = (shift_2nd>0) ? (1<<(shift_2nd-1)) : 0;
	346
[2]	347	/* Horizontal transform */
	348	for (i=0; i<uiTrSize; i++)
[1313]	349	{
[2]	350	for (j=0; j<uiTrSize; j++)
	351	{
	352	iSum = 0;
	353	for (k=0; k<uiTrSize; k++)
[1313]	354	{
	355	iSum += iT[kuiTrSize+i]coeff[k*uiTrSize+j];
[2]	356	}
[1313]	357
	358	// Clipping here is not in the standard, but is used to protect the "Pel" data type into which the inverse-transformed samples will be copied
	359	tmp[i*uiTrSize+j] = Clip3<TCoeff>(clipMinimum, clipMaximum, (iSum + add_1st)>>shift_1st);
[2]	360	}
	361	}
[1313]	362
[2]	363	/* Vertical transform */
	364	for (i=0; i<uiTrSize; i++)
[1313]	365	{
[2]	366	for (j=0; j<uiTrSize; j++)
	367	{
	368	iSum = 0;
	369	for (k=0; k<uiTrSize; k++)
[1313]	370	{
[2]	371	iSum += iT[kuiTrSize+j]tmp[i*uiTrSize+k];
	372	}
[1313]	373
	374	block[i*uiStride+j] = Clip3<TCoeff>(std::numeric_limits<Pel>::min(), std::numeric_limits<Pel>::max(), (iSum + add_2nd)>>shift_2nd);
[2]	375	}
	376	}
	377	}
	378
[1313]	379	#endif //MATRIX_MULT
[2]	380
[1313]	381
[2]	382	/** 4x4 forward transform implemented using partial butterfly structure (1D)
[56]	383	* \param src input data (residual)
	384	* \param dst output data (transform coefficients)
[2]	385	* \param shift specifies right shift after 1D transform
[1313]	386	* \param line
[2]	387	*/
[1313]	388	Void partialButterfly4(TCoeff src, TCoeff dst, Int shift, Int line)
[56]	389	{
[608]	390	Int j;
[1313]	391	TCoeff E[2],O[2];
	392	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
[56]	393
	394	for (j=0; j<line; j++)
[1313]	395	{
[56]	396	/* E and O */
	397	E[0] = src[0] + src[3];
	398	O[0] = src[0] - src[3];
	399	E[1] = src[1] + src[2];
	400	O[1] = src[1] - src[2];
	401
[1313]	402	dst[0] = (g_aiT4[TRANSFORM_FORWARD][0][0]E[0] + g_aiT4[TRANSFORM_FORWARD][0][1]E[1] + add)>>shift;
	403	dst[2line] = (g_aiT4[TRANSFORM_FORWARD][2][0]E[0] + g_aiT4[TRANSFORM_FORWARD][2][1]*E[1] + add)>>shift;
	404	dst[line] = (g_aiT4[TRANSFORM_FORWARD][1][0]O[0] + g_aiT4[TRANSFORM_FORWARD][1][1]O[1] + add)>>shift;
	405	dst[3line] = (g_aiT4[TRANSFORM_FORWARD][3][0]O[0] + g_aiT4[TRANSFORM_FORWARD][3][1]*O[1] + add)>>shift;
[56]	406
	407	src += 4;
	408	dst ++;
	409	}
	410	}
	411
[1313]	412	// Fast DST Algorithm. Full matrix multiplication for DST and Fast DST algorithm
[2]	413	// give identical results
[1313]	414	Void fastForwardDst(TCoeff block, TCoeff coeff, Int shift) // input block, output coeff
[2]	415	{
[1313]	416	Int i;
	417	TCoeff c[4];
	418	TCoeff rnd_factor = (shift > 0) ? (1<<(shift-1)) : 0;
[2]	419	for (i=0; i<4; i++)
	420	{
	421	// Intermediate Variables
[1313]	422	c[0] = block[4*i+0];
	423	c[1] = block[4*i+1];
	424	c[2] = block[4*i+2];
	425	c[3] = block[4*i+3];
[56]	426
[1313]	427	for (Int row = 0; row < 4; row++)
	428	{
	429	TCoeff result = 0;
	430	for (Int column = 0; column < 4; column++)
	431	{
	432	result += c[column] * g_as_DST_MAT_4[TRANSFORM_FORWARD][row][column]; // use the defined matrix, rather than hard-wired numbers
	433	}
	434
	435	coeff[(row * 4) + i] = rightShift((result + rnd_factor), shift);
	436	}
[2]	437	}
	438	}
[56]	439
[1313]	440	Void fastInverseDst(TCoeff tmp, TCoeff block, Int shift, const TCoeff outputMinimum, const TCoeff outputMaximum) // input tmp, output block
[2]	441	{
[1313]	442	Int i;
	443	TCoeff c[4];
	444	TCoeff rnd_factor = (shift > 0) ? (1<<(shift-1)) : 0;
[2]	445	for (i=0; i<4; i++)
[1313]	446	{
[2]	447	// Intermediate Variables
[1313]	448	c[0] = tmp[ i];
	449	c[1] = tmp[4 +i];
	450	c[2] = tmp[8 +i];
	451	c[3] = tmp[12+i];
[56]	452
[1313]	453	for (Int column = 0; column < 4; column++)
	454	{
	455	TCoeff &result = block[(i * 4) + column];
	456
	457	result = 0;
	458	for (Int row = 0; row < 4; row++)
	459	{
	460	result += c[row] * g_as_DST_MAT_4[TRANSFORM_INVERSE][row][column]; // use the defined matrix, rather than hard-wired numbers
	461	}
	462
	463	result = Clip3( outputMinimum, outputMaximum, rightShift((result + rnd_factor), shift));
	464	}
[2]	465	}
	466	}
[56]	467
[1313]	468	/** 4x4 inverse transform implemented using partial butterfly structure (1D)
	469	* \param src input data (transform coefficients)
	470	* \param dst output data (residual)
	471	* \param shift specifies right shift after 1D transform
	472	* \param line
	473	* \param outputMinimum minimum for clipping
	474	* \param outputMaximum maximum for clipping
	475	*/
	476	Void partialButterflyInverse4(TCoeff src, TCoeff dst, Int shift, Int line, const TCoeff outputMinimum, const TCoeff outputMaximum)
[56]	477	{
[608]	478	Int j;
[1313]	479	TCoeff E[2],O[2];
	480	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
[56]	481
	482	for (j=0; j<line; j++)
[1313]	483	{
	484	/* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
	485	O[0] = g_aiT4[TRANSFORM_INVERSE][1][0]src[line] + g_aiT4[TRANSFORM_INVERSE][3][0]src[3*line];
	486	O[1] = g_aiT4[TRANSFORM_INVERSE][1][1]src[line] + g_aiT4[TRANSFORM_INVERSE][3][1]src[3*line];
	487	E[0] = g_aiT4[TRANSFORM_INVERSE][0][0]src[0] + g_aiT4[TRANSFORM_INVERSE][2][0]src[2*line];
	488	E[1] = g_aiT4[TRANSFORM_INVERSE][0][1]src[0] + g_aiT4[TRANSFORM_INVERSE][2][1]src[2*line];
[56]	489
	490	/* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
[1313]	491	dst[0] = Clip3( outputMinimum, outputMaximum, (E[0] + O[0] + add)>>shift );
	492	dst[1] = Clip3( outputMinimum, outputMaximum, (E[1] + O[1] + add)>>shift );
	493	dst[2] = Clip3( outputMinimum, outputMaximum, (E[1] - O[1] + add)>>shift );
	494	dst[3] = Clip3( outputMinimum, outputMaximum, (E[0] - O[0] + add)>>shift );
	495
[56]	496	src ++;
	497	dst += 4;
	498	}
	499	}
	500
[1313]	501	/** 8x8 forward transform implemented using partial butterfly structure (1D)
	502	* \param src input data (residual)
	503	* \param dst output data (transform coefficients)
	504	* \param shift specifies right shift after 1D transform
	505	* \param line
	506	*/
	507	Void partialButterfly8(TCoeff src, TCoeff dst, Int shift, Int line)
[56]	508	{
[608]	509	Int j,k;
[1313]	510	TCoeff E[4],O[4];
	511	TCoeff EE[2],EO[2];
	512	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
[56]	513
	514	for (j=0; j<line; j++)
[1313]	515	{
[56]	516	/* E and O*/
	517	for (k=0;k<4;k++)
	518	{
	519	E[k] = src[k] + src[7-k];
	520	O[k] = src[k] - src[7-k];
[1313]	521	}
[56]	522	/* EE and EO */
[1313]	523	EE[0] = E[0] + E[3];
[56]	524	EO[0] = E[0] - E[3];
	525	EE[1] = E[1] + E[2];
	526	EO[1] = E[1] - E[2];
	527
[1313]	528	dst[0] = (g_aiT8[TRANSFORM_FORWARD][0][0]EE[0] + g_aiT8[TRANSFORM_FORWARD][0][1]EE[1] + add)>>shift;
	529	dst[4line] = (g_aiT8[TRANSFORM_FORWARD][4][0]EE[0] + g_aiT8[TRANSFORM_FORWARD][4][1]*EE[1] + add)>>shift;
	530	dst[2line] = (g_aiT8[TRANSFORM_FORWARD][2][0]EO[0] + g_aiT8[TRANSFORM_FORWARD][2][1]*EO[1] + add)>>shift;
	531	dst[6line] = (g_aiT8[TRANSFORM_FORWARD][6][0]EO[0] + g_aiT8[TRANSFORM_FORWARD][6][1]*EO[1] + add)>>shift;
[56]	532
[1313]	533	dst[line] = (g_aiT8[TRANSFORM_FORWARD][1][0]O[0] + g_aiT8[TRANSFORM_FORWARD][1][1]O[1] + g_aiT8[TRANSFORM_FORWARD][1][2]O[2] + g_aiT8[TRANSFORM_FORWARD][1][3]O[3] + add)>>shift;
	534	dst[3line] = (g_aiT8[TRANSFORM_FORWARD][3][0]O[0] + g_aiT8[TRANSFORM_FORWARD][3][1]O[1] + g_aiT8[TRANSFORM_FORWARD][3][2]O[2] + g_aiT8[TRANSFORM_FORWARD][3][3]*O[3] + add)>>shift;
	535	dst[5line] = (g_aiT8[TRANSFORM_FORWARD][5][0]O[0] + g_aiT8[TRANSFORM_FORWARD][5][1]O[1] + g_aiT8[TRANSFORM_FORWARD][5][2]O[2] + g_aiT8[TRANSFORM_FORWARD][5][3]*O[3] + add)>>shift;
	536	dst[7line] = (g_aiT8[TRANSFORM_FORWARD][7][0]O[0] + g_aiT8[TRANSFORM_FORWARD][7][1]O[1] + g_aiT8[TRANSFORM_FORWARD][7][2]O[2] + g_aiT8[TRANSFORM_FORWARD][7][3]*O[3] + add)>>shift;
[56]	537
	538	src += 8;
	539	dst ++;
	540	}
	541	}
	542
[1313]	543	/** 8x8 inverse transform implemented using partial butterfly structure (1D)
	544	* \param src input data (transform coefficients)
	545	* \param dst output data (residual)
	546	* \param shift specifies right shift after 1D transform
	547	* \param line
	548	* \param outputMinimum minimum for clipping
	549	* \param outputMaximum maximum for clipping
	550	*/
	551	Void partialButterflyInverse8(TCoeff src, TCoeff dst, Int shift, Int line, const TCoeff outputMinimum, const TCoeff outputMaximum)
[56]	552	{
[608]	553	Int j,k;
[1313]	554	TCoeff E[4],O[4];
	555	TCoeff EE[2],EO[2];
	556	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
[56]	557
[1313]	558	for (j=0; j<line; j++)
	559	{
[56]	560	/* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
	561	for (k=0;k<4;k++)
	562	{
[1313]	563	O[k] = g_aiT8[TRANSFORM_INVERSE][ 1][k]src[line] + g_aiT8[TRANSFORM_INVERSE][ 3][k]src[3*line] +
	564	g_aiT8[TRANSFORM_INVERSE][ 5][k]src[5line] + g_aiT8[TRANSFORM_INVERSE][ 7][k]src[7line];
[56]	565	}
	566
[1313]	567	EO[0] = g_aiT8[TRANSFORM_INVERSE][2][0]src[ 2line ] + g_aiT8[TRANSFORM_INVERSE][6][0]src[ 6line ];
	568	EO[1] = g_aiT8[TRANSFORM_INVERSE][2][1]src[ 2line ] + g_aiT8[TRANSFORM_INVERSE][6][1]src[ 6line ];
	569	EE[0] = g_aiT8[TRANSFORM_INVERSE][0][0]src[ 0 ] + g_aiT8[TRANSFORM_INVERSE][4][0]src[ 4*line ];
	570	EE[1] = g_aiT8[TRANSFORM_INVERSE][0][1]src[ 0 ] + g_aiT8[TRANSFORM_INVERSE][4][1]src[ 4*line ];
[56]	571
[1313]	572	/* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
[56]	573	E[0] = EE[0] + EO[0];
	574	E[3] = EE[0] - EO[0];
	575	E[1] = EE[1] + EO[1];
	576	E[2] = EE[1] - EO[1];
	577	for (k=0;k<4;k++)
	578	{
[1313]	579	dst[ k ] = Clip3( outputMinimum, outputMaximum, (E[k] + O[k] + add)>>shift );
	580	dst[ k+4 ] = Clip3( outputMinimum, outputMaximum, (E[3-k] - O[3-k] + add)>>shift );
	581	}
[56]	582	src ++;
	583	dst += 8;
	584	}
	585	}
	586
[1313]	587	/** 16x16 forward transform implemented using partial butterfly structure (1D)
	588	* \param src input data (residual)
	589	* \param dst output data (transform coefficients)
	590	* \param shift specifies right shift after 1D transform
	591	* \param line
	592	*/
	593	Void partialButterfly16(TCoeff src, TCoeff dst, Int shift, Int line)
[56]	594	{
[608]	595	Int j,k;
[1313]	596	TCoeff E[8],O[8];
	597	TCoeff EE[4],EO[4];
	598	TCoeff EEE[2],EEO[2];
	599	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
[56]	600
[1313]	601	for (j=0; j<line; j++)
	602	{
[56]	603	/* E and O*/
	604	for (k=0;k<8;k++)
	605	{
	606	E[k] = src[k] + src[15-k];
	607	O[k] = src[k] - src[15-k];
[1313]	608	}
[56]	609	/* EE and EO */
	610	for (k=0;k<4;k++)
	611	{
	612	EE[k] = E[k] + E[7-k];
	613	EO[k] = E[k] - E[7-k];
	614	}
	615	/* EEE and EEO */
[1313]	616	EEE[0] = EE[0] + EE[3];
[56]	617	EEO[0] = EE[0] - EE[3];
	618	EEE[1] = EE[1] + EE[2];
	619	EEO[1] = EE[1] - EE[2];
	620
[1313]	621	dst[ 0 ] = (g_aiT16[TRANSFORM_FORWARD][ 0][0]EEE[0] + g_aiT16[TRANSFORM_FORWARD][ 0][1]EEE[1] + add)>>shift;
	622	dst[ 8line ] = (g_aiT16[TRANSFORM_FORWARD][ 8][0]EEE[0] + g_aiT16[TRANSFORM_FORWARD][ 8][1]*EEE[1] + add)>>shift;
	623	dst[ 4line ] = (g_aiT16[TRANSFORM_FORWARD][ 4][0]EEO[0] + g_aiT16[TRANSFORM_FORWARD][ 4][1]*EEO[1] + add)>>shift;
	624	dst[ 12line] = (g_aiT16[TRANSFORM_FORWARD][12][0]EEO[0] + g_aiT16[TRANSFORM_FORWARD][12][1]*EEO[1] + add)>>shift;
[56]	625
	626	for (k=2;k<16;k+=4)
	627	{
[1313]	628	dst[ kline ] = (g_aiT16[TRANSFORM_FORWARD][k][0]EO[0] + g_aiT16[TRANSFORM_FORWARD][k][1]*EO[1] +
	629	g_aiT16[TRANSFORM_FORWARD][k][2]EO[2] + g_aiT16[TRANSFORM_FORWARD][k][3]EO[3] + add)>>shift;
[56]	630	}
	631
	632	for (k=1;k<16;k+=2)
	633	{
[1313]	634	dst[ kline ] = (g_aiT16[TRANSFORM_FORWARD][k][0]O[0] + g_aiT16[TRANSFORM_FORWARD][k][1]*O[1] +
	635	g_aiT16[TRANSFORM_FORWARD][k][2]O[2] + g_aiT16[TRANSFORM_FORWARD][k][3]O[3] +
	636	g_aiT16[TRANSFORM_FORWARD][k][4]O[4] + g_aiT16[TRANSFORM_FORWARD][k][5]O[5] +
	637	g_aiT16[TRANSFORM_FORWARD][k][6]O[6] + g_aiT16[TRANSFORM_FORWARD][k][7]O[7] + add)>>shift;
[56]	638	}
	639
	640	src += 16;
[1313]	641	dst ++;
[56]	642
	643	}
	644	}
	645
[1313]	646	/** 16x16 inverse transform implemented using partial butterfly structure (1D)
	647	* \param src input data (transform coefficients)
	648	* \param dst output data (residual)
	649	* \param shift specifies right shift after 1D transform
	650	* \param line
	651	* \param outputMinimum minimum for clipping
	652	* \param outputMaximum maximum for clipping
	653	*/
	654	Void partialButterflyInverse16(TCoeff src, TCoeff dst, Int shift, Int line, const TCoeff outputMinimum, const TCoeff outputMaximum)
[56]	655	{
[608]	656	Int j,k;
[1313]	657	TCoeff E[8],O[8];
	658	TCoeff EE[4],EO[4];
	659	TCoeff EEE[2],EEO[2];
	660	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
[56]	661
	662	for (j=0; j<line; j++)
[1313]	663	{
[56]	664	/* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
	665	for (k=0;k<8;k++)
	666	{
[1313]	667	O[k] = g_aiT16[TRANSFORM_INVERSE][ 1][k]src[ line] + g_aiT16[TRANSFORM_INVERSE][ 3][k]src[ 3*line] +
	668	g_aiT16[TRANSFORM_INVERSE][ 5][k]src[ 5line] + g_aiT16[TRANSFORM_INVERSE][ 7][k]src[ 7line] +
	669	g_aiT16[TRANSFORM_INVERSE][ 9][k]src[ 9line] + g_aiT16[TRANSFORM_INVERSE][11][k]src[11line] +
	670	g_aiT16[TRANSFORM_INVERSE][13][k]src[13line] + g_aiT16[TRANSFORM_INVERSE][15][k]src[15line];
[56]	671	}
	672	for (k=0;k<4;k++)
	673	{
[1313]	674	EO[k] = g_aiT16[TRANSFORM_INVERSE][ 2][k]src[ 2line] + g_aiT16[TRANSFORM_INVERSE][ 6][k]src[ 6line] +
	675	g_aiT16[TRANSFORM_INVERSE][10][k]src[10line] + g_aiT16[TRANSFORM_INVERSE][14][k]src[14line];
[56]	676	}
[1313]	677	EEO[0] = g_aiT16[TRANSFORM_INVERSE][4][0]src[ 4line ] + g_aiT16[TRANSFORM_INVERSE][12][0]src[ 12line ];
	678	EEE[0] = g_aiT16[TRANSFORM_INVERSE][0][0]src[ 0 ] + g_aiT16[TRANSFORM_INVERSE][ 8][0]src[ 8*line ];
	679	EEO[1] = g_aiT16[TRANSFORM_INVERSE][4][1]src[ 4line ] + g_aiT16[TRANSFORM_INVERSE][12][1]src[ 12line ];
	680	EEE[1] = g_aiT16[TRANSFORM_INVERSE][0][1]src[ 0 ] + g_aiT16[TRANSFORM_INVERSE][ 8][1]src[ 8*line ];
[56]	681
[1313]	682	/* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
[56]	683	for (k=0;k<2;k++)
	684	{
	685	EE[k] = EEE[k] + EEO[k];
	686	EE[k+2] = EEE[1-k] - EEO[1-k];
[1313]	687	}
[56]	688	for (k=0;k<4;k++)
	689	{
	690	E[k] = EE[k] + EO[k];
	691	E[k+4] = EE[3-k] - EO[3-k];
[1313]	692	}
[56]	693	for (k=0;k<8;k++)
	694	{
[1313]	695	dst[k] = Clip3( outputMinimum, outputMaximum, (E[k] + O[k] + add)>>shift );
	696	dst[k+8] = Clip3( outputMinimum, outputMaximum, (E[7-k] - O[7-k] + add)>>shift );
	697	}
	698	src ++;
[56]	699	dst += 16;
	700	}
	701	}
	702
[1313]	703	/** 32x32 forward transform implemented using partial butterfly structure (1D)
	704	* \param src input data (residual)
	705	* \param dst output data (transform coefficients)
	706	* \param shift specifies right shift after 1D transform
	707	* \param line
	708	*/
	709	Void partialButterfly32(TCoeff src, TCoeff dst, Int shift, Int line)
[56]	710	{
[608]	711	Int j,k;
[1313]	712	TCoeff E[16],O[16];
	713	TCoeff EE[8],EO[8];
	714	TCoeff EEE[4],EEO[4];
	715	TCoeff EEEE[2],EEEO[2];
	716	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
[56]	717
	718	for (j=0; j<line; j++)
[1313]	719	{
[56]	720	/* E and O*/
	721	for (k=0;k<16;k++)
	722	{
	723	E[k] = src[k] + src[31-k];
	724	O[k] = src[k] - src[31-k];
[1313]	725	}
[56]	726	/* EE and EO */
	727	for (k=0;k<8;k++)
	728	{
	729	EE[k] = E[k] + E[15-k];
	730	EO[k] = E[k] - E[15-k];
	731	}
	732	/* EEE and EEO */
	733	for (k=0;k<4;k++)
	734	{
	735	EEE[k] = EE[k] + EE[7-k];
	736	EEO[k] = EE[k] - EE[7-k];
	737	}
	738	/* EEEE and EEEO */
[1313]	739	EEEE[0] = EEE[0] + EEE[3];
[56]	740	EEEO[0] = EEE[0] - EEE[3];
	741	EEEE[1] = EEE[1] + EEE[2];
	742	EEEO[1] = EEE[1] - EEE[2];
	743
[1313]	744	dst[ 0 ] = (g_aiT32[TRANSFORM_FORWARD][ 0][0]EEEE[0] + g_aiT32[TRANSFORM_FORWARD][ 0][1]EEEE[1] + add)>>shift;
	745	dst[ 16line ] = (g_aiT32[TRANSFORM_FORWARD][16][0]EEEE[0] + g_aiT32[TRANSFORM_FORWARD][16][1]*EEEE[1] + add)>>shift;
	746	dst[ 8line ] = (g_aiT32[TRANSFORM_FORWARD][ 8][0]EEEO[0] + g_aiT32[TRANSFORM_FORWARD][ 8][1]*EEEO[1] + add)>>shift;
	747	dst[ 24line ] = (g_aiT32[TRANSFORM_FORWARD][24][0]EEEO[0] + g_aiT32[TRANSFORM_FORWARD][24][1]*EEEO[1] + add)>>shift;
[56]	748	for (k=4;k<32;k+=8)
	749	{
[1313]	750	dst[ kline ] = (g_aiT32[TRANSFORM_FORWARD][k][0]EEO[0] + g_aiT32[TRANSFORM_FORWARD][k][1]*EEO[1] +
	751	g_aiT32[TRANSFORM_FORWARD][k][2]EEO[2] + g_aiT32[TRANSFORM_FORWARD][k][3]EEO[3] + add)>>shift;
	752	}
[56]	753	for (k=2;k<32;k+=4)
	754	{
[1313]	755	dst[ kline ] = (g_aiT32[TRANSFORM_FORWARD][k][0]EO[0] + g_aiT32[TRANSFORM_FORWARD][k][1]*EO[1] +
	756	g_aiT32[TRANSFORM_FORWARD][k][2]EO[2] + g_aiT32[TRANSFORM_FORWARD][k][3]EO[3] +
	757	g_aiT32[TRANSFORM_FORWARD][k][4]EO[4] + g_aiT32[TRANSFORM_FORWARD][k][5]EO[5] +
	758	g_aiT32[TRANSFORM_FORWARD][k][6]EO[6] + g_aiT32[TRANSFORM_FORWARD][k][7]EO[7] + add)>>shift;
	759	}
[56]	760	for (k=1;k<32;k+=2)
	761	{
[1313]	762	dst[ kline ] = (g_aiT32[TRANSFORM_FORWARD][k][ 0]O[ 0] + g_aiT32[TRANSFORM_FORWARD][k][ 1]*O[ 1] +
	763	g_aiT32[TRANSFORM_FORWARD][k][ 2]O[ 2] + g_aiT32[TRANSFORM_FORWARD][k][ 3]O[ 3] +
	764	g_aiT32[TRANSFORM_FORWARD][k][ 4]O[ 4] + g_aiT32[TRANSFORM_FORWARD][k][ 5]O[ 5] +
	765	g_aiT32[TRANSFORM_FORWARD][k][ 6]O[ 6] + g_aiT32[TRANSFORM_FORWARD][k][ 7]O[ 7] +
	766	g_aiT32[TRANSFORM_FORWARD][k][ 8]O[ 8] + g_aiT32[TRANSFORM_FORWARD][k][ 9]O[ 9] +
	767	g_aiT32[TRANSFORM_FORWARD][k][10]O[10] + g_aiT32[TRANSFORM_FORWARD][k][11]O[11] +
	768	g_aiT32[TRANSFORM_FORWARD][k][12]O[12] + g_aiT32[TRANSFORM_FORWARD][k][13]O[13] +
	769	g_aiT32[TRANSFORM_FORWARD][k][14]O[14] + g_aiT32[TRANSFORM_FORWARD][k][15]O[15] + add)>>shift;
[56]	770	}
[1313]	771
[56]	772	src += 32;
	773	dst ++;
	774	}
	775	}
	776
[1313]	777	/** 32x32 inverse transform implemented using partial butterfly structure (1D)
	778	* \param src input data (transform coefficients)
	779	* \param dst output data (residual)
	780	* \param shift specifies right shift after 1D transform
	781	* \param line
	782	* \param outputMinimum minimum for clipping
	783	* \param outputMaximum maximum for clipping
	784	*/
	785	Void partialButterflyInverse32(TCoeff src, TCoeff dst, Int shift, Int line, const TCoeff outputMinimum, const TCoeff outputMaximum)
[56]	786	{
[608]	787	Int j,k;
[1313]	788	TCoeff E[16],O[16];
	789	TCoeff EE[8],EO[8];
	790	TCoeff EEE[4],EEO[4];
	791	TCoeff EEEE[2],EEEO[2];
	792	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
[56]	793
	794	for (j=0; j<line; j++)
[1313]	795	{
[56]	796	/* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
	797	for (k=0;k<16;k++)
	798	{
[1313]	799	O[k] = g_aiT32[TRANSFORM_INVERSE][ 1][k]src[ line ] + g_aiT32[TRANSFORM_INVERSE][ 3][k]src[ 3*line ] +
	800	g_aiT32[TRANSFORM_INVERSE][ 5][k]src[ 5line ] + g_aiT32[TRANSFORM_INVERSE][ 7][k]src[ 7line ] +
	801	g_aiT32[TRANSFORM_INVERSE][ 9][k]src[ 9line ] + g_aiT32[TRANSFORM_INVERSE][11][k]src[ 11line ] +
	802	g_aiT32[TRANSFORM_INVERSE][13][k]src[ 13line ] + g_aiT32[TRANSFORM_INVERSE][15][k]src[ 15line ] +
	803	g_aiT32[TRANSFORM_INVERSE][17][k]src[ 17line ] + g_aiT32[TRANSFORM_INVERSE][19][k]src[ 19line ] +
	804	g_aiT32[TRANSFORM_INVERSE][21][k]src[ 21line ] + g_aiT32[TRANSFORM_INVERSE][23][k]src[ 23line ] +
	805	g_aiT32[TRANSFORM_INVERSE][25][k]src[ 25line ] + g_aiT32[TRANSFORM_INVERSE][27][k]src[ 27line ] +
	806	g_aiT32[TRANSFORM_INVERSE][29][k]src[ 29line ] + g_aiT32[TRANSFORM_INVERSE][31][k]src[ 31line ];
[56]	807	}
	808	for (k=0;k<8;k++)
	809	{
[1313]	810	EO[k] = g_aiT32[TRANSFORM_INVERSE][ 2][k]src[ 2line ] + g_aiT32[TRANSFORM_INVERSE][ 6][k]src[ 6line ] +
	811	g_aiT32[TRANSFORM_INVERSE][10][k]src[ 10line ] + g_aiT32[TRANSFORM_INVERSE][14][k]src[ 14line ] +
	812	g_aiT32[TRANSFORM_INVERSE][18][k]src[ 18line ] + g_aiT32[TRANSFORM_INVERSE][22][k]src[ 22line ] +
	813	g_aiT32[TRANSFORM_INVERSE][26][k]src[ 26line ] + g_aiT32[TRANSFORM_INVERSE][30][k]src[ 30line ];
[56]	814	}
	815	for (k=0;k<4;k++)
	816	{
[1313]	817	EEO[k] = g_aiT32[TRANSFORM_INVERSE][ 4][k]src[ 4line ] + g_aiT32[TRANSFORM_INVERSE][12][k]src[ 12line ] +
	818	g_aiT32[TRANSFORM_INVERSE][20][k]src[ 20line ] + g_aiT32[TRANSFORM_INVERSE][28][k]src[ 28line ];
[56]	819	}
[1313]	820	EEEO[0] = g_aiT32[TRANSFORM_INVERSE][8][0]src[ 8line ] + g_aiT32[TRANSFORM_INVERSE][24][0]src[ 24line ];
	821	EEEO[1] = g_aiT32[TRANSFORM_INVERSE][8][1]src[ 8line ] + g_aiT32[TRANSFORM_INVERSE][24][1]src[ 24line ];
	822	EEEE[0] = g_aiT32[TRANSFORM_INVERSE][0][0]src[ 0 ] + g_aiT32[TRANSFORM_INVERSE][16][0]src[ 16*line ];
	823	EEEE[1] = g_aiT32[TRANSFORM_INVERSE][0][1]src[ 0 ] + g_aiT32[TRANSFORM_INVERSE][16][1]src[ 16*line ];
[56]	824
	825	/* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
	826	EEE[0] = EEEE[0] + EEEO[0];
	827	EEE[3] = EEEE[0] - EEEO[0];
	828	EEE[1] = EEEE[1] + EEEO[1];
[1313]	829	EEE[2] = EEEE[1] - EEEO[1];
[56]	830	for (k=0;k<4;k++)
	831	{
	832	EE[k] = EEE[k] + EEO[k];
	833	EE[k+4] = EEE[3-k] - EEO[3-k];
[1313]	834	}
[56]	835	for (k=0;k<8;k++)
	836	{
	837	E[k] = EE[k] + EO[k];
	838	E[k+8] = EE[7-k] - EO[7-k];
[1313]	839	}
[56]	840	for (k=0;k<16;k++)
	841	{
[1313]	842	dst[k] = Clip3( outputMinimum, outputMaximum, (E[k] + O[k] + add)>>shift );
	843	dst[k+16] = Clip3( outputMinimum, outputMaximum, (E[15-k] - O[15-k] + add)>>shift );
[56]	844	}
	845	src ++;
	846	dst += 32;
	847	}
	848	}
	849
	850	/** MxN forward transform (2D)
[1313]	851	* \param bitDepth [in] bit depth
	852	* \param block [in] residual block
	853	* \param coeff [out] transform coefficients
	854	* \param iWidth [in] width of transform
	855	* \param iHeight [in] height of transform
	856	* \param useDST [in]
	857	* \param maxLog2TrDynamicRange [in]
	858
[56]	859	*/
[1313]	860	Void xTrMxN(Int bitDepth, TCoeff block, TCoeff coeff, Int iWidth, Int iHeight, Bool useDST, const Int maxLog2TrDynamicRange)
[2]	861	{
[1313]	862	const Int TRANSFORM_MATRIX_SHIFT = g_transformMatrixShift[TRANSFORM_FORWARD];
[2]	863
[1313]	864	const Int shift_1st = ((g_aucConvertToBit[iWidth] + 2) + bitDepth + TRANSFORM_MATRIX_SHIFT) - maxLog2TrDynamicRange;
	865	const Int shift_2nd = (g_aucConvertToBit[iHeight] + 2) + TRANSFORM_MATRIX_SHIFT;
[2]	866
[1313]	867	assert(shift_1st >= 0);
	868	assert(shift_2nd >= 0);
	869
	870	TCoeff tmp[ MAX_TU_SIZE * MAX_TU_SIZE ];
	871
	872	switch (iWidth)
[2]	873	{
[1313]	874	case 4:
	875	{
	876	if ((iHeight == 4) && useDST) // Check for DCT or DST
	877	{
	878	fastForwardDst( block, tmp, shift_1st );
	879	}
	880	else
	881	{
	882	partialButterfly4 ( block, tmp, shift_1st, iHeight );
	883	}
	884	}
	885	break;
[608]	886
[1313]	887	case 8: partialButterfly8 ( block, tmp, shift_1st, iHeight ); break;
	888	case 16: partialButterfly16( block, tmp, shift_1st, iHeight ); break;
	889	case 32: partialButterfly32( block, tmp, shift_1st, iHeight ); break;
	890	default:
	891	assert(0); exit (1); break;
[2]	892	}
[1313]	893
	894	switch (iHeight)
[2]	895	{
[1313]	896	case 4:
	897	{
	898	if ((iWidth == 4) && useDST) // Check for DCT or DST
	899	{
	900	fastForwardDst( tmp, coeff, shift_2nd );
	901	}
	902	else
	903	{
	904	partialButterfly4 ( tmp, coeff, shift_2nd, iWidth );
	905	}
	906	}
	907	break;
	908
	909	case 8: partialButterfly8 ( tmp, coeff, shift_2nd, iWidth ); break;
	910	case 16: partialButterfly16( tmp, coeff, shift_2nd, iWidth ); break;
	911	case 32: partialButterfly32( tmp, coeff, shift_2nd, iWidth ); break;
	912	default:
	913	assert(0); exit (1); break;
[2]	914	}
[56]	915	}
[1313]	916
	917
[56]	918	/** MxN inverse transform (2D)
[1313]	919	* \param bitDepth [in] bit depth
	920	* \param coeff [in] transform coefficients
	921	* \param block [out] residual block
	922	* \param iWidth [in] width of transform
	923	* \param iHeight [in] height of transform
	924	* \param useDST [in]
	925	* \param maxLog2TrDynamicRange [in]
[56]	926	*/
[1313]	927	Void xITrMxN(Int bitDepth, TCoeff coeff, TCoeff block, Int iWidth, Int iHeight, Bool useDST, const Int maxLog2TrDynamicRange)
[56]	928	{
[1313]	929	const Int TRANSFORM_MATRIX_SHIFT = g_transformMatrixShift[TRANSFORM_INVERSE];
[2]	930
[1313]	931	Int shift_1st = TRANSFORM_MATRIX_SHIFT + 1; //1 has been added to shift_1st at the expense of shift_2nd
	932	Int shift_2nd = (TRANSFORM_MATRIX_SHIFT + maxLog2TrDynamicRange - 1) - bitDepth;
	933	const TCoeff clipMinimum = -(1 << maxLog2TrDynamicRange);
	934	const TCoeff clipMaximum = (1 << maxLog2TrDynamicRange) - 1;
	935
	936	assert(shift_1st >= 0);
	937	assert(shift_2nd >= 0);
	938
	939	TCoeff tmp[MAX_TU_SIZE * MAX_TU_SIZE];
	940
	941	switch (iHeight)
[56]	942	{
[1313]	943	case 4:
	944	{
	945	if ((iWidth == 4) && useDST) // Check for DCT or DST
	946	{
	947	fastInverseDst( coeff, tmp, shift_1st, clipMinimum, clipMaximum);
	948	}
	949	else
	950	{
	951	partialButterflyInverse4 ( coeff, tmp, shift_1st, iWidth, clipMinimum, clipMaximum);
	952	}
	953	}
	954	break;
	955
	956	case 8: partialButterflyInverse8 ( coeff, tmp, shift_1st, iWidth, clipMinimum, clipMaximum); break;
	957	case 16: partialButterflyInverse16( coeff, tmp, shift_1st, iWidth, clipMinimum, clipMaximum); break;
	958	case 32: partialButterflyInverse32( coeff, tmp, shift_1st, iWidth, clipMinimum, clipMaximum); break;
	959
	960	default:
	961	assert(0); exit (1); break;
[2]	962	}
[1313]	963
	964	switch (iWidth)
[2]	965	{
[1313]	966	// Clipping here is not in the standard, but is used to protect the "Pel" data type into which the inverse-transformed samples will be copied
	967	case 4:
	968	{
	969	if ((iHeight == 4) && useDST) // Check for DCT or DST
	970	{
	971	fastInverseDst( tmp, block, shift_2nd, std::numeric_limits<Pel>::min(), std::numeric_limits<Pel>::max() );
	972	}
	973	else
	974	{
	975	partialButterflyInverse4 ( tmp, block, shift_2nd, iHeight, std::numeric_limits<Pel>::min(), std::numeric_limits<Pel>::max());
	976	}
	977	}
	978	break;
	979
	980	case 8: partialButterflyInverse8 ( tmp, block, shift_2nd, iHeight, std::numeric_limits<Pel>::min(), std::numeric_limits<Pel>::max()); break;
	981	case 16: partialButterflyInverse16( tmp, block, shift_2nd, iHeight, std::numeric_limits<Pel>::min(), std::numeric_limits<Pel>::max()); break;
	982	case 32: partialButterflyInverse32( tmp, block, shift_2nd, iHeight, std::numeric_limits<Pel>::min(), std::numeric_limits<Pel>::max()); break;
	983
	984	default:
	985	assert(0); exit (1); break;
[2]	986	}
[56]	987	}
[2]	988
	989
[1313]	990	// To minimize the distortion only. No rate is considered.
	991	Void TComTrQuant::signBitHidingHDQ( TCoeff* pQCoef, TCoeff* pCoef, TCoeff* deltaU, const TUEntropyCodingParameters &codingParameters, const Int maxLog2TrDynamicRange )
[56]	992	{
[1313]	993	const UInt width = codingParameters.widthInGroups << MLS_CG_LOG2_WIDTH;
	994	const UInt height = codingParameters.heightInGroups << MLS_CG_LOG2_HEIGHT;
	995	const UInt groupSize = 1 << MLS_CG_SIZE;
	996
	997	const TCoeff entropyCodingMinimum = -(1 << maxLog2TrDynamicRange);
	998	const TCoeff entropyCodingMaximum = (1 << maxLog2TrDynamicRange) - 1;
	999
[56]	1000	Int lastCG = -1;
	1001	Int absSum = 0 ;
	1002	Int n ;
[2]	1003
[1313]	1004	for( Int subSet = (width*height-1) >> MLS_CG_SIZE; subSet >= 0; subSet-- )
[56]	1005	{
[1313]	1006	Int subPos = subSet << MLS_CG_SIZE;
	1007	Int firstNZPosInCG=groupSize , lastNZPosInCG=-1 ;
[56]	1008	absSum = 0 ;
[2]	1009
[1313]	1010	for(n = groupSize-1; n >= 0; --n )
[56]	1011	{
[1313]	1012	if( pQCoef[ codingParameters.scan[ n + subPos ]] )
[56]	1013	{
	1014	lastNZPosInCG = n;
	1015	break;
	1016	}
	1017	}
[2]	1018
[1313]	1019	for(n = 0; n <groupSize; n++ )
[56]	1020	{
[1313]	1021	if( pQCoef[ codingParameters.scan[ n + subPos ]] )
[56]	1022	{
	1023	firstNZPosInCG = n;
	1024	break;
	1025	}
	1026	}
[2]	1027
[56]	1028	for(n = firstNZPosInCG; n <=lastNZPosInCG; n++ )
	1029	{
[1313]	1030	absSum += Int(pQCoef[ codingParameters.scan[ n + subPos ]]);
[56]	1031	}
[2]	1032
[1313]	1033	if(lastNZPosInCG>=0 && lastCG==-1)
[56]	1034	{
[1313]	1035	lastCG = 1 ;
[56]	1036	}
[608]	1037
	1038	if( lastNZPosInCG-firstNZPosInCG>=SBH_THRESHOLD )
[56]	1039	{
[1313]	1040	UInt signbit = (pQCoef[codingParameters.scan[subPos+firstNZPosInCG]]>0?0:1) ;
[56]	1041	if( signbit!=(absSum&0x1) ) //compare signbit with sum_parity
	1042	{
[1313]	1043	TCoeff curCost = std::numeric_limits<TCoeff>::max();
	1044	TCoeff minCostInc = std::numeric_limits<TCoeff>::max();
	1045	Int minPos =-1, finalChange=0, curChange=0;
	1046
	1047	for( n = (lastCG==1?lastNZPosInCG:groupSize-1) ; n >= 0; --n )
[2]	1048	{
[1313]	1049	UInt blkPos = codingParameters.scan[ n+subPos ];
[56]	1050	if(pQCoef[ blkPos ] != 0 )
[2]	1051	{
[56]	1052	if(deltaU[blkPos]>0)
[2]	1053	{
[1313]	1054	curCost = - deltaU[blkPos];
[56]	1055	curChange=1 ;
[2]	1056	}
[1313]	1057	else
[2]	1058	{
[56]	1059	//curChange =-1;
	1060	if(n==firstNZPosInCG && abs(pQCoef[blkPos])==1)
	1061	{
[1313]	1062	curCost = std::numeric_limits<TCoeff>::max();
[56]	1063	}
	1064	else
	1065	{
[1313]	1066	curCost = deltaU[blkPos];
[56]	1067	curChange =-1;
	1068	}
[2]	1069	}
	1070	}
	1071	else
	1072	{
[56]	1073	if(n<firstNZPosInCG)
	1074	{
	1075	UInt thisSignBit = (pCoef[blkPos]>=0?0:1);
	1076	if(thisSignBit != signbit )
	1077	{
[1313]	1078	curCost = std::numeric_limits<TCoeff>::max();
[2]	1079	}
[56]	1080	else
[1313]	1081	{
[56]	1082	curCost = - (deltaU[blkPos]) ;
	1083	curChange = 1 ;
[2]	1084	}
	1085	}
	1086	else
	1087	{
[56]	1088	curCost = - (deltaU[blkPos]) ;
	1089	curChange = 1 ;
[2]	1090	}
	1091	}
[56]	1092
	1093	if( curCost<minCostInc)
[2]	1094	{
[56]	1095	minCostInc = curCost ;
	1096	finalChange = curChange ;
	1097	minPos = blkPos ;
[2]	1098	}
[56]	1099	} //CG loop
[2]	1100
[1313]	1101	if(pQCoef[minPos] == entropyCodingMaximum \|\| pQCoef[minPos] == entropyCodingMinimum)
[56]	1102	{
	1103	finalChange = -1;
[2]	1104	}
	1105
[56]	1106	if(pCoef[minPos]>=0)
[2]	1107	{
[1313]	1108	pQCoef[minPos] += finalChange ;
[2]	1109	}
[1313]	1110	else
	1111	{
[56]	1112	pQCoef[minPos] -= finalChange ;
[1313]	1113	}
[56]	1114	} // Hide
	1115	}
[1313]	1116	if(lastCG==1)
[56]	1117	{
	1118	lastCG=0 ;
	1119	}
	1120	} // TU loop
	1121
	1122	return;
	1123	}
	1124
[1313]	1125
	1126	Void TComTrQuant::xQuant( TComTU &rTu,
	1127	TCoeff * pSrc,
	1128	TCoeff * pDes,
[56]	1129	#if ADAPTIVE_QP_SELECTION
[1313]	1130	TCoeff *pArlDes,
[56]	1131	#endif
[1313]	1132	TCoeff &uiAbsSum,
	1133	const ComponentID compID,
	1134	const QpParam &cQP )
[2]	1135	{
[1313]	1136	const TComRectangle &rect = rTu.getRect(compID);
	1137	const UInt uiWidth = rect.width;
	1138	const UInt uiHeight = rect.height;
	1139	TComDataCU* pcCU = rTu.getCU();
	1140	const UInt uiAbsPartIdx = rTu.GetAbsPartIdxTU();
	1141	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
	1142
	1143	TCoeff* piCoef = pSrc;
[56]	1144	TCoeff* piQCoef = pDes;
	1145	#if ADAPTIVE_QP_SELECTION
[1313]	1146	TCoeff* piArlCCoef = pArlDes;
[56]	1147	#endif
[1313]	1148
	1149	const Bool useTransformSkip = pcCU->getTransformSkip(uiAbsPartIdx, compID);
	1150	const Int maxLog2TrDynamicRange = pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID));
	1151
	1152	Bool useRDOQ = useTransformSkip ? m_useRDOQTS : m_useRDOQ;
	1153	if ( useRDOQ && (isLuma(compID) \|\| RDOQ_CHROMA) )
[2]	1154	{
[1313]	1155	#if T0196_SELECTIVE_RDOQ
	1156	if ( !m_useSelectiveRDOQ \|\| xNeedRDOQ( rTu, piCoef, compID, cQP ) )
	1157	{
	1158	#endif
[56]	1159	#if ADAPTIVE_QP_SELECTION
[1313]	1160	xRateDistOptQuant( rTu, piCoef, pDes, pArlDes, uiAbsSum, compID, cQP );
[2]	1161	#else
[1313]	1162	xRateDistOptQuant( rTu, piCoef, pDes, uiAbsSum, compID, cQP );
[2]	1163	#endif
[1313]	1164	#if T0196_SELECTIVE_RDOQ
	1165	}
	1166	else
	1167	{
	1168	memset( pDes, 0, sizeof( TCoeff ) * uiWidth *uiHeight );
	1169	uiAbsSum = 0;
	1170	}
	1171	#endif
[2]	1172	}
	1173	else
	1174	{
[1313]	1175	TUEntropyCodingParameters codingParameters;
	1176	getTUEntropyCodingParameters(codingParameters, rTu, compID);
[56]	1177
[1313]	1178	const TCoeff entropyCodingMinimum = -(1 << maxLog2TrDynamicRange);
	1179	const TCoeff entropyCodingMaximum = (1 << maxLog2TrDynamicRange) - 1;
[56]	1180
[1313]	1181	TCoeff deltaU[MAX_TU_SIZE * MAX_TU_SIZE];
[56]	1182
[1313]	1183	const UInt uiLog2TrSize = rTu.GetEquivalentLog2TrSize(compID);
[56]	1184
[1313]	1185	Int scalingListType = getScalingListType(pcCU->getPredictionMode(uiAbsPartIdx), compID);
	1186	assert(scalingListType < SCALING_LIST_NUM);
	1187	Int *piQuantCoeff = getQuantCoeff(scalingListType, cQP.rem, uiLog2TrSize-2);
	1188
	1189	const Bool enableScalingLists = getUseScalingList(uiWidth, uiHeight, (pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0));
	1190	const Int defaultQuantisationCoefficient = g_quantScales[cQP.rem];
	1191
	1192	/* for 422 chroma blocks, the effective scaling applied during transformation is not a power of 2, hence it cannot be
	1193	* implemented as a bit-shift (the quantised result will be sqrt(2) * larger than required). Alternatively, adjust the
	1194	* uiLog2TrSize applied in iTransformShift, such that the result is 1/sqrt(2) the required result (i.e. smaller)
	1195	* Then a QP+3 (sqrt(2)) or QP-3 (1/sqrt(2)) method could be used to get the required result
	1196	*/
	1197
	1198	// Represents scaling through forward transform
	1199	Int iTransformShift = getTransformShift(channelBitDepth, uiLog2TrSize, maxLog2TrDynamicRange);
	1200	if (useTransformSkip && pcCU->getSlice()->getSPS()->getSpsRangeExtension().getExtendedPrecisionProcessingFlag())
[2]	1201	{
[1313]	1202	iTransformShift = std::max<Int>(0, iTransformShift);
[2]	1203	}
[1313]	1204
	1205	const Int iQBits = QUANT_SHIFT + cQP.per + iTransformShift;
	1206	// QBits will be OK for any internal bit depth as the reduction in transform shift is balanced by an increase in Qp_per due to QpBDOffset
	1207
	1208	#if ADAPTIVE_QP_SELECTION
	1209	Int iQBitsC = MAX_INT;
	1210	Int iAddC = MAX_INT;
	1211
	1212	if (m_bUseAdaptQpSelect)
[56]	1213	{
[1313]	1214	iQBitsC = iQBits - ARL_C_PRECISION;
	1215	iAddC = 1 << (iQBitsC-1);
[2]	1216	}
	1217	#endif
	1218
[1313]	1219	const Int iAdd = (pcCU->getSlice()->getSliceType()==I_SLICE ? 171 : 85) << (iQBits-9);
	1220	const Int qBits8 = iQBits - 8;
[2]	1221
[1313]	1222	for( Int uiBlockPos = 0; uiBlockPos < uiWidth*uiHeight; uiBlockPos++ )
	1223	{
	1224	const TCoeff iLevel = piCoef[uiBlockPos];
	1225	const TCoeff iSign = (iLevel < 0 ? -1: 1);
[2]	1226
[1313]	1227	const Int64 tmpLevel = (Int64)abs(iLevel) * (enableScalingLists ? piQuantCoeff[uiBlockPos] : defaultQuantisationCoefficient);
[56]	1228
	1229	#if ADAPTIVE_QP_SELECTION
	1230	if( m_bUseAdaptQpSelect )
[2]	1231	{
[1313]	1232	piArlCCoef[uiBlockPos] = (TCoeff)((tmpLevel + iAddC ) >> iQBitsC);
[2]	1233	}
	1234	#endif
[1313]	1235
	1236	const TCoeff quantisedMagnitude = TCoeff((tmpLevel + iAdd ) >> iQBits);
	1237	deltaU[uiBlockPos] = (TCoeff)((tmpLevel - (quantisedMagnitude<<iQBits) )>> qBits8);
	1238
	1239	uiAbsSum += quantisedMagnitude;
	1240	const TCoeff quantisedCoefficient = quantisedMagnitude * iSign;
	1241
	1242	piQCoef[uiBlockPos] = Clip3<TCoeff>( entropyCodingMinimum, entropyCodingMaximum, quantisedCoefficient );
[2]	1243	} // for n
[1313]	1244
[56]	1245	if( pcCU->getSlice()->getPPS()->getSignHideFlag() )
[2]	1246	{
[1313]	1247	if(uiAbsSum >= 2) //this prevents TUs with only one coefficient of value 1 from being tested
[2]	1248	{
[1313]	1249	signBitHidingHDQ( piQCoef, piCoef, deltaU, codingParameters, maxLog2TrDynamicRange ) ;
[2]	1250	}
	1251	}
[56]	1252	} //if RDOQ
	1253	//return;
[2]	1254	}
	1255
#if T0196_SELECTIVE_RDOQ
/** Decide whether RDOQ is worth running for a TU component.
*  Applies a scalar quantiser with its own rounding offset to every
*  coefficient and reports whether any of them quantises to a non-zero level.
*
*  \param rTu     [in] transform unit being processed
*  \param pSrc    [in] coefficients to examine
*  \param compID  [in] colour component
*  \param cQP     [in] quantisation parameter (per / rem)
*  \returns true as soon as one coefficient yields a non-zero level, false otherwise
*/
Bool TComTrQuant::xNeedRDOQ( TComTU &rTu, TCoeff * pSrc, const ComponentID compID, const QpParam &cQP )
{
  const TComRectangle &tuArea    = rTu.getRect(compID);
  const UInt           tuWidth   = tuArea.width;
  const UInt           tuHeight  = tuArea.height;
  TComDataCU          *cu        = rTu.getCU();
  const UInt           partIdx   = rTu.GetAbsPartIdxTU();
  const Int            bitDepth  = cu->getSlice()->getSPS()->getBitDepth(toChannelType(compID));

  const Bool transformSkipped = cu->getTransformSkip(partIdx, compID);
  const Int  dynamicRange     = cu->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID));

  const UInt log2TrSize = rTu.GetEquivalentLog2TrSize(compID);

  Int listType = getScalingListType(cu->getPredictionMode(partIdx), compID);
  assert(listType < SCALING_LIST_NUM);
  Int *listScales = getQuantCoeff(listType, cQP.rem, log2TrSize-2);

  const Bool useLists  = getUseScalingList(tuWidth, tuHeight, (cu->getTransformSkip(partIdx, compID) != 0));
  const Int  flatScale = g_quantScales[cQP.rem];

  /* for 422 chroma blocks, the effective scaling applied during transformation is not a power of 2, hence it cannot be
   * implemented as a bit-shift (the quantised result will be sqrt(2) * larger than required). Alternatively, adjust the
   * uiLog2TrSize applied in iTransformShift, such that the result is 1/sqrt(2) the required result (i.e. smaller)
   * Then a QP+3 (sqrt(2)) or QP-3 (1/sqrt(2)) method could be used to get the required result
   */

  // Represents scaling through forward transform
  Int trShift = getTransformShift(bitDepth, log2TrSize, dynamicRange);
  if (transformSkipped && cu->getSlice()->getSPS()->getSpsRangeExtension().getExtendedPrecisionProcessingFlag())
  {
    trShift = std::max<Int>(0, trShift);
  }

  // QBits will be OK for any internal bit depth as the reduction in transform shift is balanced by an increase in Qp_per due to QpBDOffset
  const Int qBits = QUANT_SHIFT + cQP.per + trShift;

  // iAdd is different from the iAdd used in normal quantization
  const Int rounding = (compID == COMPONENT_Y ? 171 : 256) << (qBits-9);

  const UInt numCoeffs = tuWidth*tuHeight;
  for( UInt pos = 0; pos < numCoeffs; pos++ )
  {
    const TCoeff level     = pSrc[pos];
    const Int64  scaled    = (Int64)abs(level) * (useLists ? listScales[pos] : flatScale);
    const TCoeff magnitude = TCoeff((scaled + rounding ) >> qBits);

    if ( magnitude != 0 )
    {
      return true;
    }
  }

  return false;
}
#endif
	1313
	1314	Void TComTrQuant::xDeQuant( TComTU &rTu,
	1315	const TCoeff * pSrc,
	1316	TCoeff * pDes,
	1317	const ComponentID compID,
	1318	const QpParam &cQP )
	1319	{
	1320	assert(compID<MAX_NUM_COMPONENT);
	1321
	1322	TComDataCU *pcCU = rTu.getCU();
	1323	const UInt uiAbsPartIdx = rTu.GetAbsPartIdxTU();
	1324	const TComRectangle &rect = rTu.getRect(compID);
	1325	const UInt uiWidth = rect.width;
	1326	const UInt uiHeight = rect.height;
	1327	const TCoeff *const piQCoef = pSrc;
	1328	TCoeff *const piCoef = pDes;
	1329	const UInt uiLog2TrSize = rTu.GetEquivalentLog2TrSize(compID);
	1330	const UInt numSamplesInBlock = uiWidth*uiHeight;
	1331	const Int maxLog2TrDynamicRange = pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID));
	1332	const TCoeff transformMinimum = -(1 << maxLog2TrDynamicRange);
	1333	const TCoeff transformMaximum = (1 << maxLog2TrDynamicRange) - 1;
	1334	const Bool enableScalingLists = getUseScalingList(uiWidth, uiHeight, (pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0));
	1335	const Int scalingListType = getScalingListType(pcCU->getPredictionMode(uiAbsPartIdx), compID);
	1336	#if O0043_BEST_EFFORT_DECODING
	1337	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getStreamBitDepth(toChannelType(compID));
	1338	#else
	1339	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
	1340	#endif
	1341
	1342	assert (scalingListType < SCALING_LIST_NUM);
	1343	assert ( uiWidth <= m_uiMaxTrSize );
	1344
	1345	// Represents scaling through forward transform
	1346	const Bool bClipTransformShiftTo0 = (pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0) && pcCU->getSlice()->getSPS()->getSpsRangeExtension().getExtendedPrecisionProcessingFlag();
	1347	const Int originalTransformShift = getTransformShift(channelBitDepth, uiLog2TrSize, maxLog2TrDynamicRange);
	1348	const Int iTransformShift = bClipTransformShiftTo0 ? std::max<Int>(0, originalTransformShift) : originalTransformShift;
	1349
	1350	const Int QP_per = cQP.per;
	1351	const Int QP_rem = cQP.rem;
	1352
	1353	const Int rightShift = (IQUANT_SHIFT - (iTransformShift + QP_per)) + (enableScalingLists ? LOG2_SCALING_LIST_NEUTRAL_VALUE : 0);
	1354
	1355	if(enableScalingLists)
[2]	1356	{
[1313]	1357	//from the dequantisation equation:
	1358	//iCoeffQ = ((Intermediate_Int(clipQCoef) * piDequantCoef[deQuantIdx]) + iAdd ) >> rightShift
	1359	//(sizeof(Intermediate_Int) * 8) = inputBitDepth + dequantCoefBits - rightShift
	1360	const UInt dequantCoefBits = 1 + IQUANT_SHIFT + SCALING_LIST_BITS;
	1361	const UInt targetInputBitDepth = std::min<UInt>((maxLog2TrDynamicRange + 1), (((sizeof(Intermediate_Int) * 8) + rightShift) - dequantCoefBits));
[608]	1362
[1313]	1363	const Intermediate_Int inputMinimum = -(1 << (targetInputBitDepth - 1));
	1364	const Intermediate_Int inputMaximum = (1 << (targetInputBitDepth - 1)) - 1;
	1365
	1366	Int *piDequantCoef = getDequantCoeff(scalingListType,QP_rem,uiLog2TrSize-2);
	1367
	1368	if(rightShift > 0)
[2]	1369	{
[1313]	1370	const Intermediate_Int iAdd = 1 << (rightShift - 1);
	1371
	1372	for( Int n = 0; n < numSamplesInBlock; n++ )
[56]	1373	{
[1313]	1374	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, piQCoef[n]));
	1375	const Intermediate_Int iCoeffQ = ((Intermediate_Int(clipQCoef) * piDequantCoef[n]) + iAdd ) >> rightShift;
	1376
	1377	piCoef[n] = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
[56]	1378	}
[2]	1379	}
	1380	else
	1381	{
[1313]	1382	const Int leftShift = -rightShift;
	1383
	1384	for( Int n = 0; n < numSamplesInBlock; n++ )
[56]	1385	{
[1313]	1386	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, piQCoef[n]));
	1387	const Intermediate_Int iCoeffQ = (Intermediate_Int(clipQCoef) * piDequantCoef[n]) << leftShift;
	1388
	1389	piCoef[n] = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
[56]	1390	}
	1391	}
[2]	1392	}
[56]	1393	else
[2]	1394	{
[1313]	1395	const Int scale = g_invQuantScales[QP_rem];
	1396	const Int scaleBits = (IQUANT_SHIFT + 1) ;
[2]	1397
[1313]	1398	//from the dequantisation equation:
	1399	//iCoeffQ = Intermediate_Int((Int64(clipQCoef) * scale + iAdd) >> rightShift);
	1400	//(sizeof(Intermediate_Int) * 8) = inputBitDepth + scaleBits - rightShift
	1401	const UInt targetInputBitDepth = std::min<UInt>((maxLog2TrDynamicRange + 1), (((sizeof(Intermediate_Int) * 8) + rightShift) - scaleBits));
	1402	const Intermediate_Int inputMinimum = -(1 << (targetInputBitDepth - 1));
	1403	const Intermediate_Int inputMaximum = (1 << (targetInputBitDepth - 1)) - 1;
	1404
	1405	if (rightShift > 0)
[56]	1406	{
[1313]	1407	const Intermediate_Int iAdd = 1 << (rightShift - 1);
	1408
	1409	for( Int n = 0; n < numSamplesInBlock; n++ )
	1410	{
	1411	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, piQCoef[n]));
	1412	const Intermediate_Int iCoeffQ = (Intermediate_Int(clipQCoef) * scale + iAdd) >> rightShift;
	1413
	1414	piCoef[n] = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
	1415	}
[56]	1416	}
[1313]	1417	else
	1418	{
	1419	const Int leftShift = -rightShift;
	1420
	1421	for( Int n = 0; n < numSamplesInBlock; n++ )
	1422	{
	1423	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, piQCoef[n]));
	1424	const Intermediate_Int iCoeffQ = (Intermediate_Int(clipQCoef) * scale) << leftShift;
	1425
	1426	piCoef[n] = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
	1427	}
	1428	}
[2]	1429	}
	1430	}
[56]	1431
[1313]	1432
	1433	Void TComTrQuant::init( UInt uiMaxTrSize,
	1434	Bool bUseRDOQ,
	1435	Bool bUseRDOQTS,
	1436	#if T0196_SELECTIVE_RDOQ
	1437	Bool useSelectiveRDOQ,
	1438	#endif
	1439	Bool bEnc,
	1440	Bool useTransformSkipFast
[56]	1441	#if ADAPTIVE_QP_SELECTION
[1313]	1442	, Bool bUseAdaptQpSelect
[2]	1443	#endif
[56]	1444	)
[2]	1445	{
	1446	m_uiMaxTrSize = uiMaxTrSize;
	1447	m_bEnc = bEnc;
[1313]	1448	m_useRDOQ = bUseRDOQ;
	1449	m_useRDOQTS = bUseRDOQTS;
	1450	#if T0196_SELECTIVE_RDOQ
	1451	m_useSelectiveRDOQ = useSelectiveRDOQ;
	1452	#endif
[56]	1453	#if ADAPTIVE_QP_SELECTION
	1454	m_bUseAdaptQpSelect = bUseAdaptQpSelect;
[2]	1455	#endif
[608]	1456	m_useTransformSkipFast = useTransformSkipFast;
[2]	1457	}
	1458
[1313]	1459
	1460	Void TComTrQuant::transformNxN( TComTU & rTu,
	1461	const ComponentID compID,
	1462	Pel * pcResidual,
	1463	const UInt uiStride,
	1464	TCoeff * rpcCoeff,
[56]	1465	#if ADAPTIVE_QP_SELECTION
[1313]	1466	TCoeff * pcArlCoeff,
[2]	1467	#endif
[1313]	1468	TCoeff & uiAbsSum,
	1469	const QpParam & cQP
	1470	)
[2]	1471	{
[1313]	1472	const TComRectangle &rect = rTu.getRect(compID);
	1473	const UInt uiWidth = rect.width;
	1474	const UInt uiHeight = rect.height;
	1475	TComDataCU* pcCU = rTu.getCU();
	1476	const UInt uiAbsPartIdx = rTu.GetAbsPartIdxTU();
	1477	const UInt uiOrgTrDepth = rTu.GetTransformDepthRel();
	1478
	1479	uiAbsSum=0;
	1480
	1481	RDPCMMode rdpcmMode = RDPCM_OFF;
	1482	rdpcmNxN( rTu, compID, pcResidual, uiStride, cQP, rpcCoeff, uiAbsSum, rdpcmMode );
	1483
	1484	if (rdpcmMode == RDPCM_OFF)
[2]	1485	{
[1313]	1486	uiAbsSum = 0;
	1487	//transform and quantise
	1488	if(pcCU->getCUTransquantBypass(uiAbsPartIdx))
[2]	1489	{
[1313]	1490	const Bool rotateResidual = rTu.isNonTransformedResidualRotated(compID);
	1491	const UInt uiSizeMinus1 = (uiWidth * uiHeight) - 1;
	1492
	1493	for (UInt y = 0, coefficientIndex = 0; y<uiHeight; y++)
[2]	1494	{
[1313]	1495	for (UInt x = 0; x<uiWidth; x++, coefficientIndex++)
	1496	{
	1497	const Pel currentSample = pcResidual[(y * uiStride) + x];
	1498
	1499	rpcCoeff[rotateResidual ? (uiSizeMinus1 - coefficientIndex) : coefficientIndex] = currentSample;
	1500	uiAbsSum += TCoeff(abs(currentSample));
	1501	}
[2]	1502	}
	1503	}
[1313]	1504	else
	1505	{
	1506	#if DEBUG_TRANSFORM_AND_QUANTISE
	1507	std::cout << g_debugCounter << ": " << uiWidth << "x" << uiHeight << " channel " << compID << " TU at input to transform\n";
	1508	printBlock(pcResidual, uiWidth, uiHeight, uiStride);
	1509	#endif
	1510
	1511	assert( (pcCU->getSlice()->getSPS()->getMaxTrSize() >= uiWidth) );
	1512
	1513	if(pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0)
	1514	{
	1515	xTransformSkip( pcResidual, uiStride, m_plTempCoeff, rTu, compID );
	1516	}
	1517	else
	1518	{
	1519	const Int channelBitDepth=pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
	1520	xT( channelBitDepth, rTu.useDST(compID), pcResidual, uiStride, m_plTempCoeff, uiWidth, uiHeight, pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID)) );
	1521	}
	1522
	1523	#if DEBUG_TRANSFORM_AND_QUANTISE
	1524	std::cout << g_debugCounter << ": " << uiWidth << "x" << uiHeight << " channel " << compID << " TU between transform and quantiser\n";
	1525	printBlock(m_plTempCoeff, uiWidth, uiHeight, uiWidth);
	1526	#endif
	1527
	1528	xQuant( rTu, m_plTempCoeff, rpcCoeff,
	1529
	1530	#if ADAPTIVE_QP_SELECTION
	1531	pcArlCoeff,
	1532	#endif
	1533	uiAbsSum, compID, cQP );
	1534
	1535	#if DEBUG_TRANSFORM_AND_QUANTISE
	1536	std::cout << g_debugCounter << ": " << uiWidth << "x" << uiHeight << " channel " << compID << " TU at output of quantiser\n";
	1537	printBlock(rpcCoeff, uiWidth, uiHeight, uiWidth);
	1538	#endif
	1539	}
[2]	1540	}
[1313]	1541
	1542	//set the CBF
	1543	pcCU->setCbfPartRange((((uiAbsSum > 0) ? 1 : 0) << uiOrgTrDepth), compID, uiAbsPartIdx, rTu.GetAbsPartIdxNumParts(compID));
	1544	}
	1545
	1546
	1547	Void TComTrQuant::invTransformNxN( TComTU &rTu,
	1548	const ComponentID compID,
	1549	Pel *pcResidual,
	1550	const UInt uiStride,
	1551	TCoeff * pcCoeff,
	1552	const QpParam &cQP
	1553	DEBUG_STRING_FN_DECLAREP(psDebug))
	1554	{
	1555	TComDataCU* pcCU=rTu.getCU();
	1556	const UInt uiAbsPartIdx = rTu.GetAbsPartIdxTU();
	1557	const TComRectangle &rect = rTu.getRect(compID);
	1558	const UInt uiWidth = rect.width;
	1559	const UInt uiHeight = rect.height;
	1560
	1561	if (uiWidth != uiHeight) //for intra, the TU will have been split above this level, so this condition won't be true, hence this only affects inter
[2]	1562	{
[1313]	1563	//------------------------------------------------
	1564
	1565	//recurse deeper
	1566
	1567	TComTURecurse subTURecurse(rTu, false, TComTU::VERTICAL_SPLIT, true, compID);
	1568
	1569	do
	1570	{
	1571	//------------------
	1572
	1573	const UInt lineOffset = subTURecurse.GetSectionNumber() * subTURecurse.getRect(compID).height;
	1574
	1575	Pel subTUResidual = pcResidual + (lineOffset uiStride);
	1576	TCoeff subTUCoefficients = pcCoeff + (lineOffset subTURecurse.getRect(compID).width);
	1577
	1578	invTransformNxN(subTURecurse, compID, subTUResidual, uiStride, subTUCoefficients, cQP DEBUG_STRING_PASS_INTO(psDebug));
	1579
	1580	//------------------
	1581
	1582	} while (subTURecurse.nextSection(rTu));
	1583
	1584	//------------------------------------------------
	1585
	1586	return;
[2]	1587	}
[1313]	1588
	1589	#if DEBUG_STRING
	1590	if (psDebug)
[2]	1591	{
[1313]	1592	std::stringstream ss(stringstream::out);
	1593	printBlockToStream(ss, (compID==0)?"###InvTran ip Ch0: " : ((compID==1)?"###InvTran ip Ch1: ":"###InvTran ip Ch2: "), pcCoeff, uiWidth, uiHeight, uiWidth);
	1594	DEBUG_STRING_APPEND((*psDebug), ss.str())
[2]	1595	}
[1313]	1596	#endif
	1597
	1598	if(pcCU->getCUTransquantBypass(uiAbsPartIdx))
[608]	1599	{
[1313]	1600	const Bool rotateResidual = rTu.isNonTransformedResidualRotated(compID);
	1601	const UInt uiSizeMinus1 = (uiWidth * uiHeight) - 1;
	1602
	1603	for (UInt y = 0, coefficientIndex = 0; y<uiHeight; y++)
	1604	{
	1605	for (UInt x = 0; x<uiWidth; x++, coefficientIndex++)
	1606	{
	1607	pcResidual[(y * uiStride) + x] = Pel(pcCoeff[rotateResidual ? (uiSizeMinus1 - coefficientIndex) : coefficientIndex]);
	1608	}
	1609	}
[608]	1610	}
	1611	else
	1612	{
[1313]	1613	#if DEBUG_TRANSFORM_AND_QUANTISE
	1614	std::cout << g_debugCounter << ": " << uiWidth << "x" << uiHeight << " channel " << compID << " TU at input to dequantiser\n";
	1615	printBlock(pcCoeff, uiWidth, uiHeight, uiWidth);
	1616	#endif
	1617
	1618	xDeQuant(rTu, pcCoeff, m_plTempCoeff, compID, cQP);
	1619
	1620	#if DEBUG_TRANSFORM_AND_QUANTISE
	1621	std::cout << g_debugCounter << ": " << uiWidth << "x" << uiHeight << " channel " << compID << " TU between dequantiser and inverse-transform\n";
	1622	printBlock(m_plTempCoeff, uiWidth, uiHeight, uiWidth);
	1623	#endif
	1624
	1625	#if DEBUG_STRING
	1626	if (psDebug)
	1627	{
	1628	std::stringstream ss(stringstream::out);
	1629	printBlockToStream(ss, "###InvTran deq: ", m_plTempCoeff, uiWidth, uiHeight, uiWidth);
	1630	(*psDebug)+=ss.str();
	1631	}
	1632	#endif
	1633
	1634	if(pcCU->getTransformSkip(uiAbsPartIdx, compID))
	1635	{
	1636	xITransformSkip( m_plTempCoeff, pcResidual, uiStride, rTu, compID );
	1637
	1638	#if DEBUG_STRING
	1639	if (psDebug)
	1640	{
	1641	std::stringstream ss(stringstream::out);
	1642	printBlockToStream(ss, "###InvTran resi: ", pcResidual, uiWidth, uiHeight, uiStride);
	1643	(*psDebug)+=ss.str();
	1644	(*psDebug)+="(<- was a Transform-skipped block)\n";
	1645	}
	1646	#endif
	1647	}
	1648	else
	1649	{
	1650	#if O0043_BEST_EFFORT_DECODING
	1651	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getStreamBitDepth(toChannelType(compID));
	1652	#else
	1653	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
	1654	#endif
	1655	xIT( channelBitDepth, rTu.useDST(compID), m_plTempCoeff, pcResidual, uiStride, uiWidth, uiHeight, pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID)) );
	1656
	1657	#if DEBUG_STRING
	1658	if (psDebug)
	1659	{
	1660	std::stringstream ss(stringstream::out);
	1661	printBlockToStream(ss, "###InvTran resi: ", pcResidual, uiWidth, uiHeight, uiStride);
	1662	(*psDebug)+=ss.str();
	1663	(*psDebug)+="(<- was a Transformed block)\n";
	1664	}
	1665	#endif
	1666	}
	1667
	1668	#if DEBUG_TRANSFORM_AND_QUANTISE
	1669	std::cout << g_debugCounter << ": " << uiWidth << "x" << uiHeight << " channel " << compID << " TU at output of inverse-transform\n";
	1670	printBlock(pcResidual, uiWidth, uiHeight, uiStride);
	1671	g_debugCounter++;
	1672	#endif
[608]	1673	}
[1313]	1674
	1675	invRdpcmNxN( rTu, compID, pcResidual, uiStride );
[2]	1676	}
	1677
[1313]	1678	Void TComTrQuant::invRecurTransformNxN( const ComponentID compID,
	1679	TComYuv *pResidual,
	1680	TComTU &rTu)
[2]	1681	{
[1313]	1682	if (!rTu.ProcessComponentSection(compID))
[2]	1683	{
[1313]	1684	return;
	1685	}
	1686
	1687	TComDataCU* pcCU = rTu.getCU();
	1688	UInt absPartIdxTU = rTu.GetAbsPartIdxTU();
	1689	UInt uiTrMode=rTu.GetTransformDepthRel();
	1690	if( (pcCU->getCbf(absPartIdxTU, compID, uiTrMode) == 0) && (isLuma(compID) \|\| !pcCU->getSlice()->getPPS()->getPpsRangeExtension().getCrossComponentPredictionEnabledFlag()) )
	1691	{
	1692	return;
	1693	}
	1694
	1695	if( uiTrMode == pcCU->getTransformIdx( absPartIdxTU ) )
	1696	{
	1697	const TComRectangle &tuRect = rTu.getRect(compID);
	1698	const Int uiStride = pResidual->getStride( compID );
	1699	Pel *rpcResidual = pResidual->getAddr( compID );
	1700	UInt uiAddr = (tuRect.x0 + uiStride*tuRect.y0);
	1701	Pel *pResi = rpcResidual + uiAddr;
	1702	TCoeff *pcCoeff = pcCU->getCoeff(compID) + rTu.getCoefficientOffset(compID);
	1703
	1704	const QpParam cQP(*pcCU, compID);
	1705
	1706	if(pcCU->getCbf(absPartIdxTU, compID, uiTrMode) != 0)
[2]	1707	{
[1313]	1708	DEBUG_STRING_NEW(sTemp)
	1709	#if DEBUG_STRING
	1710	std::string *psDebug=((DebugOptionList::DebugString_InvTran.getInt()&(pcCU->isIntra(absPartIdxTU)?1:(pcCU->isInter(absPartIdxTU)?2:4)))!=0) ? &sTemp : 0;
	1711	#endif
	1712
	1713	invTransformNxN( rTu, compID, pResi, uiStride, pcCoeff, cQP DEBUG_STRING_PASS_INTO(psDebug) );
	1714
	1715	#if DEBUG_STRING
	1716	if (psDebug != 0)
[56]	1717	{
[1313]	1718	std::cout << (*psDebug);
[56]	1719	}
[1313]	1720	#endif
	1721	}
	1722
	1723	if (isChroma(compID) && (pcCU->getCrossComponentPredictionAlpha(absPartIdxTU, compID) != 0))
	1724	{
	1725	const Pel *piResiLuma = pResidual->getAddr( COMPONENT_Y );
	1726	const Int strideLuma = pResidual->getStride( COMPONENT_Y );
	1727	const Int tuWidth = rTu.getRect( compID ).width;
	1728	const Int tuHeight = rTu.getRect( compID ).height;
	1729
	1730	if(pcCU->getCbf(absPartIdxTU, COMPONENT_Y, uiTrMode) != 0)
	1731	{
	1732	pResi = rpcResidual + uiAddr;
	1733	const Pel *pResiLuma = piResiLuma + uiAddr;
	1734
	1735	crossComponentPrediction( rTu, compID, pResiLuma, pResi, pResi, tuWidth, tuHeight, strideLuma, uiStride, uiStride, true );
	1736	}
	1737	}
[2]	1738	}
[1313]	1739	else
[608]	1740	{
[1313]	1741	TComTURecurse tuRecurseChild(rTu, false);
	1742	do
	1743	{
	1744	invRecurTransformNxN( compID, pResidual, tuRecurseChild );
	1745	} while (tuRecurseChild.nextSection(rTu));
[608]	1746	}
[1313]	1747	}
	1748
	1749	Void TComTrQuant::applyForwardRDPCM( TComTU& rTu, const ComponentID compID, Pel* pcResidual, const UInt uiStride, const QpParam& cQP, TCoeff* pcCoeff, TCoeff &uiAbsSum, const RDPCMMode mode )
	1750	{
	1751	TComDataCU *pcCU=rTu.getCU();
	1752	const UInt uiAbsPartIdx=rTu.GetAbsPartIdxTU();
	1753
	1754	const Bool bLossless = pcCU->getCUTransquantBypass( uiAbsPartIdx );
	1755	const UInt uiWidth = rTu.getRect(compID).width;
	1756	const UInt uiHeight = rTu.getRect(compID).height;
	1757	const Bool rotateResidual = rTu.isNonTransformedResidualRotated(compID);
	1758	const UInt uiSizeMinus1 = (uiWidth * uiHeight) - 1;
	1759
	1760	UInt uiX = 0;
	1761	UInt uiY = 0;
	1762
	1763	UInt &majorAxis = (mode == RDPCM_VER) ? uiX : uiY;
	1764	UInt &minorAxis = (mode == RDPCM_VER) ? uiY : uiX;
	1765	const UInt majorAxisLimit = (mode == RDPCM_VER) ? uiWidth : uiHeight;
	1766	const UInt minorAxisLimit = (mode == RDPCM_VER) ? uiHeight : uiWidth;
	1767
	1768	const Bool bUseHalfRoundingPoint = (mode != RDPCM_OFF);
	1769
	1770	uiAbsSum = 0;
	1771
	1772	for ( majorAxis = 0; majorAxis < majorAxisLimit; majorAxis++ )
[608]	1773	{
[1313]	1774	TCoeff accumulatorValue = 0; // 32-bit accumulator
	1775	for ( minorAxis = 0; minorAxis < minorAxisLimit; minorAxis++ )
	1776	{
	1777	const UInt sampleIndex = (uiY * uiWidth) + uiX;
	1778	const UInt coefficientIndex = (rotateResidual ? (uiSizeMinus1-sampleIndex) : sampleIndex);
	1779	const Pel currentSample = pcResidual[(uiY * uiStride) + uiX];
	1780	const TCoeff encoderSideDelta = TCoeff(currentSample) - accumulatorValue;
	1781
	1782	Pel reconstructedDelta;
	1783	if ( bLossless )
	1784	{
	1785	pcCoeff[coefficientIndex] = encoderSideDelta;
	1786	reconstructedDelta = (Pel) encoderSideDelta;
	1787	}
	1788	else
	1789	{
	1790	transformSkipQuantOneSample(rTu, compID, encoderSideDelta, pcCoeff, coefficientIndex, cQP, bUseHalfRoundingPoint);
	1791	invTrSkipDeQuantOneSample (rTu, compID, pcCoeff[coefficientIndex], reconstructedDelta, cQP, coefficientIndex);
	1792	}
	1793
	1794	uiAbsSum += abs(pcCoeff[coefficientIndex]);
	1795
	1796	if (mode != RDPCM_OFF)
	1797	{
	1798	accumulatorValue += reconstructedDelta;
	1799	}
	1800	}
[608]	1801	}
[2]	1802	}
	1803
[1313]	1804	Void TComTrQuant::rdpcmNxN ( TComTU& rTu, const ComponentID compID, Pel* pcResidual, const UInt uiStride, const QpParam& cQP, TCoeff* pcCoeff, TCoeff &uiAbsSum, RDPCMMode& rdpcmMode )
[2]	1805	{
[1313]	1806	TComDataCU *pcCU=rTu.getCU();
	1807	const UInt uiAbsPartIdx=rTu.GetAbsPartIdxTU();
	1808
	1809	if (!pcCU->isRDPCMEnabled(uiAbsPartIdx) \|\| ((pcCU->getTransformSkip(uiAbsPartIdx, compID) == 0) && !pcCU->getCUTransquantBypass(uiAbsPartIdx)))
[2]	1810	{
[1313]	1811	rdpcmMode = RDPCM_OFF;
	1812	}
	1813	else if ( pcCU->isIntra( uiAbsPartIdx ) )
[2]	1814	{
[1313]	1815	const ChromaFormat chFmt = pcCU->getPic()->getPicYuvOrg()->getChromaFormat();
	1816	const ChannelType chType = toChannelType(compID);
	1817	const UInt uiChPredMode = pcCU->getIntraDir( chType, uiAbsPartIdx );
	1818	const TComSPS *sps=pcCU->getSlice()->getSPS();
	1819	const UInt partsPerMinCU = 1<<(2*(sps->getMaxTotalCUDepth() - sps->getLog2DiffMaxMinCodingBlockSize()));
	1820	const UInt uiChCodedMode = (uiChPredMode==DM_CHROMA_IDX && isChroma(compID)) ? pcCU->getIntraDir(CHANNEL_TYPE_LUMA, getChromasCorrespondingPULumaIdx(uiAbsPartIdx, chFmt, partsPerMinCU)) : uiChPredMode;
	1821	const UInt uiChFinalMode = ((chFmt == CHROMA_422) && isChroma(compID)) ? g_chroma422IntraAngleMappingTable[uiChCodedMode] : uiChCodedMode;
	1822
	1823	if (uiChFinalMode == VER_IDX \|\| uiChFinalMode == HOR_IDX)
[2]	1824	{
[1313]	1825	rdpcmMode = (uiChFinalMode == VER_IDX) ? RDPCM_VER : RDPCM_HOR;
	1826	applyForwardRDPCM( rTu, compID, pcResidual, uiStride, cQP, pcCoeff, uiAbsSum, rdpcmMode );
	1827	}
	1828	else
	1829	{
	1830	rdpcmMode = RDPCM_OFF;
	1831	}
	1832	}
	1833	else // not intra, need to select the best mode
	1834	{
	1835	const UInt uiWidth = rTu.getRect(compID).width;
	1836	const UInt uiHeight = rTu.getRect(compID).height;
	1837
	1838	RDPCMMode bestMode = NUMBER_OF_RDPCM_MODES;
	1839	TCoeff bestAbsSum = std::numeric_limits<TCoeff>::max();
	1840	TCoeff bestCoefficients[MAX_TU_SIZE * MAX_TU_SIZE];
	1841
	1842	for (UInt modeIndex = 0; modeIndex < NUMBER_OF_RDPCM_MODES; modeIndex++)
	1843	{
	1844	const RDPCMMode mode = RDPCMMode(modeIndex);
	1845
	1846	TCoeff currAbsSum = 0;
	1847
	1848	applyForwardRDPCM( rTu, compID, pcResidual, uiStride, cQP, pcCoeff, currAbsSum, mode );
	1849
	1850	if (currAbsSum < bestAbsSum)
[2]	1851	{
[1313]	1852	bestMode = mode;
	1853	bestAbsSum = currAbsSum;
	1854	if (mode != RDPCM_OFF)
	1855	{
	1856	memcpy(bestCoefficients, pcCoeff, (uiWidth * uiHeight * sizeof(TCoeff)));
	1857	}
[2]	1858	}
	1859	}
[1313]	1860
	1861	rdpcmMode = bestMode;
	1862	uiAbsSum = bestAbsSum;
	1863
	1864	if (rdpcmMode != RDPCM_OFF) //the TU is re-transformed and quantised if DPCM_OFF is returned, so there is no need to preserve it here
	1865	{
	1866	memcpy(pcCoeff, bestCoefficients, (uiWidth * uiHeight * sizeof(TCoeff)));
	1867	}
[2]	1868	}
[1313]	1869
	1870	pcCU->setExplicitRdpcmModePartRange(rdpcmMode, compID, uiAbsPartIdx, rTu.GetAbsPartIdxNumParts(compID));
	1871	}
	1872
	1873	Void TComTrQuant::invRdpcmNxN( TComTU& rTu, const ComponentID compID, Pel* pcResidual, const UInt uiStride )
	1874	{
	1875	TComDataCU *pcCU=rTu.getCU();
	1876	const UInt uiAbsPartIdx=rTu.GetAbsPartIdxTU();
	1877
	1878	if (pcCU->isRDPCMEnabled( uiAbsPartIdx ) && ((pcCU->getTransformSkip(uiAbsPartIdx, compID ) != 0) \|\| pcCU->getCUTransquantBypass(uiAbsPartIdx)))
[2]	1879	{
[1313]	1880	const UInt uiWidth = rTu.getRect(compID).width;
	1881	const UInt uiHeight = rTu.getRect(compID).height;
	1882
	1883	RDPCMMode rdpcmMode = RDPCM_OFF;
	1884
	1885	if ( pcCU->isIntra( uiAbsPartIdx ) )
[56]	1886	{
[1313]	1887	const ChromaFormat chFmt = pcCU->getPic()->getPicYuvRec()->getChromaFormat();
	1888	const ChannelType chType = toChannelType(compID);
	1889	const UInt uiChPredMode = pcCU->getIntraDir( chType, uiAbsPartIdx );
	1890	const TComSPS *sps=pcCU->getSlice()->getSPS();
	1891	const UInt partsPerMinCU = 1<<(2*(sps->getMaxTotalCUDepth() - sps->getLog2DiffMaxMinCodingBlockSize()));
	1892	const UInt uiChCodedMode = (uiChPredMode==DM_CHROMA_IDX && isChroma(compID)) ? pcCU->getIntraDir(CHANNEL_TYPE_LUMA, getChromasCorrespondingPULumaIdx(uiAbsPartIdx, chFmt, partsPerMinCU)) : uiChPredMode;
	1893	const UInt uiChFinalMode = ((chFmt == CHROMA_422) && isChroma(compID)) ? g_chroma422IntraAngleMappingTable[uiChCodedMode] : uiChCodedMode;
	1894
	1895	if (uiChFinalMode == VER_IDX \|\| uiChFinalMode == HOR_IDX)
	1896	{
	1897	rdpcmMode = (uiChFinalMode == VER_IDX) ? RDPCM_VER : RDPCM_HOR;
	1898	}
[56]	1899	}
[1313]	1900	else // not intra case
	1901	{
	1902	rdpcmMode = RDPCMMode(pcCU->getExplicitRdpcmMode( compID, uiAbsPartIdx ));
	1903	}
	1904
	1905	const TCoeff pelMin=(TCoeff) std::numeric_limits<Pel>::min();
	1906	const TCoeff pelMax=(TCoeff) std::numeric_limits<Pel>::max();
	1907	if (rdpcmMode == RDPCM_VER)
	1908	{
	1909	for( UInt uiX = 0; uiX < uiWidth; uiX++ )
	1910	{
	1911	Pel *pcCurResidual = pcResidual+uiX;
	1912	TCoeff accumulator = *pcCurResidual; // 32-bit accumulator
	1913	pcCurResidual+=uiStride;
	1914	for( UInt uiY = 1; uiY < uiHeight; uiY++, pcCurResidual+=uiStride )
	1915	{
	1916	accumulator += *(pcCurResidual);
	1917	*pcCurResidual = (Pel)Clip3<TCoeff>(pelMin, pelMax, accumulator);
	1918	}
	1919	}
	1920	}
	1921	else if (rdpcmMode == RDPCM_HOR)
	1922	{
	1923	for( UInt uiY = 0; uiY < uiHeight; uiY++ )
	1924	{
	1925	Pel pcCurResidual = pcResidual+uiYuiStride;
	1926	TCoeff accumulator = *pcCurResidual;
	1927	pcCurResidual++;
	1928	for( UInt uiX = 1; uiX < uiWidth; uiX++, pcCurResidual++ )
	1929	{
	1930	accumulator += *(pcCurResidual);
	1931	*pcCurResidual = (Pel)Clip3<TCoeff>(pelMin, pelMax, accumulator);
	1932	}
	1933	}
	1934	}
[2]	1935	}
	1936	}
	1937
	1938	// ------------------------------------------------------------------------------------------------
	1939	// Logical transform
	1940	// ------------------------------------------------------------------------------------------------
	1941
[1313]	1942	/** Wrapper function between HM interface and core NxN forward transform (2D)
	1943	* \param channelBitDepth bit depth of channel
	1944	* \param useDST
[2]	1945	* \param piBlkResi input data (residual)
[1313]	1946	* \param uiStride stride of input residual data
[2]	1947	* \param psCoeff output data (transform coefficients)
[1313]	1948	* \param iWidth transform width
	1949	* \param iHeight transform height
	1950	* \param maxLog2TrDynamicRange
[2]	1951	*/
[1313]	1952	Void TComTrQuant::xT( const Int channelBitDepth, Bool useDST, Pel* piBlkResi, UInt uiStride, TCoeff* psCoeff, Int iWidth, Int iHeight, const Int maxLog2TrDynamicRange )
[2]	1953	{
[1313]	1954	#if MATRIX_MULT
	1955	if( iWidth == iHeight)
	1956	{
	1957	xTr(channelBitDepth, piBlkResi, psCoeff, uiStride, (UInt)iWidth, useDST, maxLog2TrDynamicRange);
	1958	return;
	1959	}
	1960	#endif
	1961
	1962	TCoeff block[ MAX_TU_SIZE * MAX_TU_SIZE ];
	1963	TCoeff coeff[ MAX_TU_SIZE * MAX_TU_SIZE ];
	1964
	1965	for (Int y = 0; y < iHeight; y++)
	1966	{
	1967	for (Int x = 0; x < iWidth; x++)
	1968	{
	1969	block[(y * iWidth) + x] = piBlkResi[(y * uiStride) + x];
[56]	1970	}
[1313]	1971	}
	1972
	1973	xTrMxN( channelBitDepth, block, coeff, iWidth, iHeight, useDST, maxLog2TrDynamicRange );
	1974
	1975	memcpy(psCoeff, coeff, (iWidth * iHeight * sizeof(TCoeff)));
[2]	1976	}
	1977
[1313]	1978	/** Wrapper function between HM interface and core NxN inverse transform (2D)
	1979	* \param channelBitDepth bit depth of channel
	1980	* \param useDST
[2]	1981	* \param plCoef input data (transform coefficients)
	1982	* \param pResidual output data (residual)
	1983	* \param uiStride stride of input residual data
[1313]	1984	* \param iWidth transform width
	1985	* \param iHeight transform height
	1986	* \param maxLog2TrDynamicRange
[2]	1987	*/
[1313]	1988	Void TComTrQuant::xIT( const Int channelBitDepth, Bool useDST, TCoeff* plCoef, Pel* pResidual, UInt uiStride, Int iWidth, Int iHeight, const Int maxLog2TrDynamicRange )
[2]	1989	{
[1313]	1990	#if MATRIX_MULT
	1991	if( iWidth == iHeight )
[56]	1992	{
[1313]	1993	xITr(channelBitDepth, plCoef, pResidual, uiStride, (UInt)iWidth, useDST, maxLog2TrDynamicRange);
	1994	return;
	1995	}
	1996	#endif
	1997
	1998	TCoeff block[ MAX_TU_SIZE * MAX_TU_SIZE ];
	1999	TCoeff coeff[ MAX_TU_SIZE * MAX_TU_SIZE ];
	2000
	2001	memcpy(coeff, plCoef, (iWidth * iHeight * sizeof(TCoeff)));
	2002
	2003	xITrMxN( channelBitDepth, coeff, block, iWidth, iHeight, useDST, maxLog2TrDynamicRange );
	2004
	2005	for (Int y = 0; y < iHeight; y++)
	2006	{
	2007	for (Int x = 0; x < iWidth; x++)
[56]	2008	{
[1313]	2009	pResidual[(y * uiStride) + x] = Pel(block[(y * iWidth) + x]);
[56]	2010	}
	2011	}
[2]	2012	}
[1313]	2013
[608]	2014	/** Wrapper function between HM interface and core 4x4 transform skipping
	2015	* \param piBlkResi input data (residual)
[1313]	2016	* \param uiStride stride of input residual data
[608]	2017	* \param psCoeff output data (transform coefficients)
[1313]	2018	* \param rTu reference to transform data
	2019	* \param component colour component
[608]	2020	*/
[1313]	2021	Void TComTrQuant::xTransformSkip( Pel* piBlkResi, UInt uiStride, TCoeff* psCoeff, TComTU &rTu, const ComponentID component )
[608]	2022	{
[1313]	2023	const TComRectangle &rect = rTu.getRect(component);
	2024	const Int width = rect.width;
	2025	const Int height = rect.height;
	2026	const Int maxLog2TrDynamicRange = rTu.getCU()->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(component));
	2027	const Int channelBitDepth = rTu.getCU()->getSlice()->getSPS()->getBitDepth(toChannelType(component));
	2028
	2029	Int iTransformShift = getTransformShift(channelBitDepth, rTu.GetEquivalentLog2TrSize(component), maxLog2TrDynamicRange);
	2030	if (rTu.getCU()->getSlice()->getSPS()->getSpsRangeExtension().getExtendedPrecisionProcessingFlag())
[608]	2031	{
[1313]	2032	iTransformShift = std::max<Int>(0, iTransformShift);
	2033	}
	2034
	2035	const Bool rotateResidual = rTu.isNonTransformedResidualRotated(component);
	2036	const UInt uiSizeMinus1 = (width * height) - 1;
	2037
	2038	if (iTransformShift >= 0)
	2039	{
	2040	for (UInt y = 0, coefficientIndex = 0; y < height; y++)
	2041	{
	2042	for (UInt x = 0; x < width; x++, coefficientIndex++)
[608]	2043	{
[1313]	2044	psCoeff[rotateResidual ? (uiSizeMinus1 - coefficientIndex) : coefficientIndex] = TCoeff(piBlkResi[(y * uiStride) + x]) << iTransformShift;
[608]	2045	}
	2046	}
	2047	}
[1313]	2048	else //for very high bit depths
[608]	2049	{
[1313]	2050	iTransformShift = -iTransformShift;
	2051	const TCoeff offset = 1 << (iTransformShift - 1);
	2052
	2053	for (UInt y = 0, coefficientIndex = 0; y < height; y++)
	2054	{
	2055	for (UInt x = 0; x < width; x++, coefficientIndex++)
[608]	2056	{
[1313]	2057	psCoeff[rotateResidual ? (uiSizeMinus1 - coefficientIndex) : coefficientIndex] = (TCoeff(piBlkResi[(y * uiStride) + x]) + offset) >> iTransformShift;
[608]	2058	}
	2059	}
	2060	}
	2061	}
	2062
[1313]	2063	/** Wrapper function between HM interface and core NxN transform skipping
[608]	2064	* \param plCoef input data (coefficients)
	2065	* \param pResidual output data (residual)
	2066	* \param uiStride stride of input residual data
[1313]	2067	* \param rTu reference to transform data
	2068	* \param component colour component ID
[608]	2069	*/
[1313]	2070	Void TComTrQuant::xITransformSkip( TCoeff* plCoef, Pel* pResidual, UInt uiStride, TComTU &rTu, const ComponentID component )
[608]	2071	{
[1313]	2072	const TComRectangle &rect = rTu.getRect(component);
	2073	const Int width = rect.width;
	2074	const Int height = rect.height;
	2075	const Int maxLog2TrDynamicRange = rTu.getCU()->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(component));
	2076	#if O0043_BEST_EFFORT_DECODING
	2077	const Int channelBitDepth = rTu.getCU()->getSlice()->getSPS()->getStreamBitDepth(toChannelType(component));
	2078	#else
	2079	const Int channelBitDepth = rTu.getCU()->getSlice()->getSPS()->getBitDepth(toChannelType(component));
	2080	#endif
	2081
	2082	Int iTransformShift = getTransformShift(channelBitDepth, rTu.GetEquivalentLog2TrSize(component), maxLog2TrDynamicRange);
	2083	if (rTu.getCU()->getSlice()->getSPS()->getSpsRangeExtension().getExtendedPrecisionProcessingFlag())
[608]	2084	{
[1313]	2085	iTransformShift = std::max<Int>(0, iTransformShift);
	2086	}
	2087
	2088	const Bool rotateResidual = rTu.isNonTransformedResidualRotated(component);
	2089	const UInt uiSizeMinus1 = (width * height) - 1;
	2090
	2091	if (iTransformShift >= 0)
	2092	{
	2093	const TCoeff offset = iTransformShift==0 ? 0 : (1 << (iTransformShift - 1));
	2094
	2095	for (UInt y = 0, coefficientIndex = 0; y < height; y++)
	2096	{
	2097	for (UInt x = 0; x < width; x++, coefficientIndex++)
[608]	2098	{
[1313]	2099	pResidual[(y * uiStride) + x] = Pel((plCoef[rotateResidual ? (uiSizeMinus1 - coefficientIndex) : coefficientIndex] + offset) >> iTransformShift);
	2100	}
[608]	2101	}
	2102	}
[1313]	2103	else //for very high bit depths
[608]	2104	{
[1313]	2105	iTransformShift = -iTransformShift;
	2106
	2107	for (UInt y = 0, coefficientIndex = 0; y < height; y++)
	2108	{
	2109	for (UInt x = 0; x < width; x++, coefficientIndex++)
[608]	2110	{
[1313]	2111	pResidual[(y * uiStride) + x] = Pel(plCoef[rotateResidual ? (uiSizeMinus1 - coefficientIndex) : coefficientIndex] << iTransformShift);
[608]	2112	}
	2113	}
	2114	}
	2115	}
	2116
[2]	2117	/** RDOQ with CABAC
[1313]	2118	* \param rTu reference to transform data
[2]	2119	* \param plSrcCoeff pointer to input buffer
	2120	* \param piDstCoeff reference to pointer to output buffer
[1313]	2121	* \param piArlDstCoeff
[2]	2122	* \param uiAbsSum reference to absolute sum of quantized transform coefficient
[1313]	2123	* \param compID colour component ID
	2124	* \param cQP reference to quantization parameters
	2125
[2]	2126	* Rate distortion optimized quantization for entropy
	2127	* coding engines using probability models like CABAC
	2128	*/
[1313]	2129	Void TComTrQuant::xRateDistOptQuant ( TComTU &rTu,
	2130	TCoeff * plSrcCoeff,
	2131	TCoeff * piDstCoeff,
[56]	2132	#if ADAPTIVE_QP_SELECTION
[1313]	2133	TCoeff * piArlDstCoeff,
[56]	2134	#endif
[1313]	2135	TCoeff &uiAbsSum,
	2136	const ComponentID compID,
	2137	const QpParam &cQP )
[2]	2138	{
[1313]	2139	const TComRectangle & rect = rTu.getRect(compID);
	2140	const UInt uiWidth = rect.width;
	2141	const UInt uiHeight = rect.height;
	2142	TComDataCU * pcCU = rTu.getCU();
	2143	const UInt uiAbsPartIdx = rTu.GetAbsPartIdxTU();
	2144	const ChannelType channelType = toChannelType(compID);
	2145	const UInt uiLog2TrSize = rTu.GetEquivalentLog2TrSize(compID);
	2146
	2147	const Bool extendedPrecision = pcCU->getSlice()->getSPS()->getSpsRangeExtension().getExtendedPrecisionProcessingFlag();
	2148	const Int maxLog2TrDynamicRange = pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID));
	2149	const Int channelBitDepth = rTu.getCU()->getSlice()->getSPS()->getBitDepth(channelType);
	2150
	2151	/* for 422 chroma blocks, the effective scaling applied during transformation is not a power of 2, hence it cannot be
	2152	* implemented as a bit-shift (the quantised result will be sqrt(2) * larger than required). Alternatively, adjust the
	2153	* uiLog2TrSize applied in iTransformShift, such that the result is 1/sqrt(2) the required result (i.e. smaller)
	2154	* Then a QP+3 (sqrt(2)) or QP-3 (1/sqrt(2)) method could be used to get the required result
	2155	*/
	2156
	2157	// Represents scaling through forward transform
	2158	Int iTransformShift = getTransformShift(channelBitDepth, uiLog2TrSize, maxLog2TrDynamicRange);
	2159	if ((pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0) && extendedPrecision)
	2160	{
	2161	iTransformShift = std::max<Int>(0, iTransformShift);
	2162	}
	2163
	2164	const Bool bUseGolombRiceParameterAdaptation = pcCU->getSlice()->getSPS()->getSpsRangeExtension().getPersistentRiceAdaptationEnabledFlag();
	2165	const UInt initialGolombRiceParameter = m_pcEstBitsSbac->golombRiceAdaptationStatistics[rTu.getGolombRiceStatisticsIndex(compID)] / RExt__GOLOMB_RICE_INCREMENT_DIVISOR;
	2166	UInt uiGoRiceParam = initialGolombRiceParameter;
	2167	Double d64BlockUncodedCost = 0;
	2168	const UInt uiLog2BlockWidth = g_aucConvertToBit[ uiWidth ] + 2;
	2169	const UInt uiLog2BlockHeight = g_aucConvertToBit[ uiHeight ] + 2;
	2170	const UInt uiMaxNumCoeff = uiWidth * uiHeight;
	2171	assert(compID<MAX_NUM_COMPONENT);
	2172
	2173	Int scalingListType = getScalingListType(pcCU->getPredictionMode(uiAbsPartIdx), compID);
[872]	2174	assert(scalingListType < SCALING_LIST_NUM);
[1313]	2175
[56]	2176	#if ADAPTIVE_QP_SELECTION
[1313]	2177	memset(piArlDstCoeff, 0, sizeof(TCoeff) * uiMaxNumCoeff);
	2178	#endif
	2179
	2180	Double pdCostCoeff [ MAX_TU_SIZE * MAX_TU_SIZE ];
	2181	Double pdCostSig [ MAX_TU_SIZE * MAX_TU_SIZE ];
	2182	Double pdCostCoeff0[ MAX_TU_SIZE * MAX_TU_SIZE ];
	2183	memset( pdCostCoeff, 0, sizeof(Double) * uiMaxNumCoeff );
	2184	memset( pdCostSig, 0, sizeof(Double) * uiMaxNumCoeff );
	2185	Int rateIncUp [ MAX_TU_SIZE * MAX_TU_SIZE ];
	2186	Int rateIncDown [ MAX_TU_SIZE * MAX_TU_SIZE ];
	2187	Int sigRateDelta[ MAX_TU_SIZE * MAX_TU_SIZE ];
	2188	TCoeff deltaU [ MAX_TU_SIZE * MAX_TU_SIZE ];
	2189	memset( rateIncUp, 0, sizeof(Int ) * uiMaxNumCoeff );
	2190	memset( rateIncDown, 0, sizeof(Int ) * uiMaxNumCoeff );
	2191	memset( sigRateDelta, 0, sizeof(Int ) * uiMaxNumCoeff );
	2192	memset( deltaU, 0, sizeof(TCoeff) * uiMaxNumCoeff );
	2193
	2194	const Int iQBits = QUANT_SHIFT + cQP.per + iTransformShift; // Right shift of non-RDOQ quantizer; level = (coeff*uiQ + offset)>>q_bits
	2195	const Double *const pdErrScale = getErrScaleCoeff(scalingListType, (uiLog2TrSize-2), cQP.rem);
	2196	const Int *const piQCoef = getQuantCoeff(scalingListType, cQP.rem, (uiLog2TrSize-2));
	2197
	2198	const Bool enableScalingLists = getUseScalingList(uiWidth, uiHeight, (pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0));
	2199	const Int defaultQuantisationCoefficient = g_quantScales[cQP.rem];
	2200	const Double defaultErrorScale = getErrScaleCoeffNoScalingList(scalingListType, (uiLog2TrSize-2), cQP.rem);
	2201
	2202	const TCoeff entropyCodingMinimum = -(1 << maxLog2TrDynamicRange);
	2203	const TCoeff entropyCodingMaximum = (1 << maxLog2TrDynamicRange) - 1;
	2204
	2205	#if ADAPTIVE_QP_SELECTION
[56]	2206	Int iQBitsC = iQBits - ARL_C_PRECISION;
	2207	Int iAddC = 1 << (iQBitsC-1);
	2208	#endif
[1313]	2209
	2210	TUEntropyCodingParameters codingParameters;
	2211	getTUEntropyCodingParameters(codingParameters, rTu, compID);
	2212	const UInt uiCGSize = (1 << MLS_CG_SIZE);
	2213
[56]	2214	Double pdCostCoeffGroupSig[ MLS_GRP_NUM ];
	2215	UInt uiSigCoeffGroupFlag[ MLS_GRP_NUM ];
	2216	Int iCGLastScanPos = -1;
[1313]	2217
[56]	2218	UInt uiCtxSet = 0;
	2219	Int c1 = 1;
	2220	Int c2 = 0;
	2221	Double d64BaseCost = 0;
	2222	Int iLastScanPos = -1;
[1313]	2223
[56]	2224	UInt c1Idx = 0;
	2225	UInt c2Idx = 0;
	2226	Int baseLevel;
[1313]	2227
	2228	memset( pdCostCoeffGroupSig, 0, sizeof(Double) * MLS_GRP_NUM );
	2229	memset( uiSigCoeffGroupFlag, 0, sizeof(UInt) * MLS_GRP_NUM );
	2230
[608]	2231	UInt uiCGNum = uiWidth * uiHeight >> MLS_CG_SIZE;
	2232	Int iScanPos;
[1313]	2233	coeffGroupRDStats rdStats;
	2234
	2235	const UInt significanceMapContextOffset = getSignificanceMapContextOffset(compID);
	2236
[608]	2237	for (Int iCGScanPos = uiCGNum-1; iCGScanPos >= 0; iCGScanPos--)
[56]	2238	{
[1313]	2239	UInt uiCGBlkPos = codingParameters.scanCG[ iCGScanPos ];
	2240	UInt uiCGPosY = uiCGBlkPos / codingParameters.widthInGroups;
	2241	UInt uiCGPosX = uiCGBlkPos - (uiCGPosY * codingParameters.widthInGroups);
	2242
	2243	memset( &rdStats, 0, sizeof (coeffGroupRDStats));
	2244
	2245	const Int patternSigCtx = TComTrQuant::calcPatternSigCtx(uiSigCoeffGroupFlag, uiCGPosX, uiCGPosY, codingParameters.widthInGroups, codingParameters.heightInGroups);
	2246
[608]	2247	for (Int iScanPosinCG = uiCGSize-1; iScanPosinCG >= 0; iScanPosinCG--)
[56]	2248	{
[608]	2249	iScanPos = iCGScanPos*uiCGSize + iScanPosinCG;
	2250	//===== quantization =====
[1313]	2251	UInt uiBlkPos = codingParameters.scan[iScanPos];
[608]	2252	// set coeff
[1313]	2253
	2254	const Int quantisationCoefficient = (enableScalingLists) ? piQCoef [uiBlkPos] : defaultQuantisationCoefficient;
	2255	const Double errorScale = (enableScalingLists) ? pdErrScale[uiBlkPos] : defaultErrorScale;
	2256
	2257	const Int64 tmpLevel = Int64(abs(plSrcCoeff[ uiBlkPos ])) * quantisationCoefficient;
	2258
	2259	const Intermediate_Int lLevelDouble = (Intermediate_Int)min<Int64>(tmpLevel, std::numeric_limits<Intermediate_Int>::max() - (Intermediate_Int(1) << (iQBits - 1)));
	2260
[608]	2261	#if ADAPTIVE_QP_SELECTION
	2262	if( m_bUseAdaptQpSelect )
[2]	2263	{
[1313]	2264	piArlDstCoeff[uiBlkPos] = (TCoeff)(( lLevelDouble + iAddC) >> iQBitsC );
[56]	2265	}
[608]	2266	#endif
[1313]	2267	const UInt uiMaxAbsLevel = std::min<UInt>(UInt(entropyCodingMaximum), UInt((lLevelDouble + (Intermediate_Int(1) << (iQBits - 1))) >> iQBits));
	2268
	2269	const Double dErr = Double( lLevelDouble );
	2270	pdCostCoeff0[ iScanPos ] = dErr * dErr * errorScale;
[608]	2271	d64BlockUncodedCost += pdCostCoeff0[ iScanPos ];
	2272	piDstCoeff[ uiBlkPos ] = uiMaxAbsLevel;
[1313]	2273
[608]	2274	if ( uiMaxAbsLevel > 0 && iLastScanPos < 0 )
	2275	{
	2276	iLastScanPos = iScanPos;
[1313]	2277	uiCtxSet = getContextSetIndex(compID, (iScanPos >> MLS_CG_SIZE), 0);
[608]	2278	iCGLastScanPos = iCGScanPos;
	2279	}
[1313]	2280
[608]	2281	if ( iLastScanPos >= 0 )
	2282	{
	2283	//===== coefficient level estimation =====
	2284	UInt uiLevel;
[1313]	2285	UInt uiOneCtx = (NUM_ONE_FLAG_CTX_PER_SET * uiCtxSet) + c1;
	2286	UInt uiAbsCtx = (NUM_ABS_FLAG_CTX_PER_SET * uiCtxSet) + c2;
	2287
[608]	2288	if( iScanPos == iLastScanPos )
[2]	2289	{
[1313]	2290	uiLevel = xGetCodedLevel( pdCostCoeff[ iScanPos ], pdCostCoeff0[ iScanPos ], pdCostSig[ iScanPos ],
	2291	lLevelDouble, uiMaxAbsLevel, significanceMapContextOffset, uiOneCtx, uiAbsCtx, uiGoRiceParam,
	2292	c1Idx, c2Idx, iQBits, errorScale, 1, extendedPrecision, maxLog2TrDynamicRange
	2293	);
[2]	2294	}
[608]	2295	else
[2]	2296	{
[1313]	2297	UShort uiCtxSig = significanceMapContextOffset + getSigCtxInc( patternSigCtx, codingParameters, iScanPos, uiLog2BlockWidth, uiLog2BlockHeight, channelType );
	2298
[608]	2299	uiLevel = xGetCodedLevel( pdCostCoeff[ iScanPos ], pdCostCoeff0[ iScanPos ], pdCostSig[ iScanPos ],
[1313]	2300	lLevelDouble, uiMaxAbsLevel, uiCtxSig, uiOneCtx, uiAbsCtx, uiGoRiceParam,
	2301	c1Idx, c2Idx, iQBits, errorScale, 0, extendedPrecision, maxLog2TrDynamicRange
	2302	);
	2303
[608]	2304	sigRateDelta[ uiBlkPos ] = m_pcEstBitsSbac->significantBits[ uiCtxSig ][ 1 ] - m_pcEstBitsSbac->significantBits[ uiCtxSig ][ 0 ];
[2]	2305	}
[1313]	2306
	2307	deltaU[ uiBlkPos ] = TCoeff((lLevelDouble - (Intermediate_Int(uiLevel) << iQBits)) >> (iQBits-8));
	2308
[608]	2309	if( uiLevel > 0 )
[2]	2310	{
[1313]	2311	Int rateNow = xGetICRate( uiLevel, uiOneCtx, uiAbsCtx, uiGoRiceParam, c1Idx, c2Idx, extendedPrecision, maxLog2TrDynamicRange );
	2312	rateIncUp [ uiBlkPos ] = xGetICRate( uiLevel+1, uiOneCtx, uiAbsCtx, uiGoRiceParam, c1Idx, c2Idx, extendedPrecision, maxLog2TrDynamicRange ) - rateNow;
	2313	rateIncDown [ uiBlkPos ] = xGetICRate( uiLevel-1, uiOneCtx, uiAbsCtx, uiGoRiceParam, c1Idx, c2Idx, extendedPrecision, maxLog2TrDynamicRange ) - rateNow;
[608]	2314	}
	2315	else // uiLevel == 0
	2316	{
	2317	rateIncUp [ uiBlkPos ] = m_pcEstBitsSbac->m_greaterOneBits[ uiOneCtx ][ 0 ];
	2318	}
	2319	piDstCoeff[ uiBlkPos ] = uiLevel;
	2320	d64BaseCost += pdCostCoeff [ iScanPos ];
[1313]	2321
[608]	2322	baseLevel = (c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx < C2FLAG_NUMBER)) : 1;
	2323	if( uiLevel >= baseLevel )
	2324	{
[1313]	2325	if (uiLevel > 3*(1<<uiGoRiceParam))
[2]	2326	{
[1313]	2327	uiGoRiceParam = bUseGolombRiceParameterAdaptation ? (uiGoRiceParam + 1) : (std::min<UInt>((uiGoRiceParam + 1), 4));
[56]	2328	}
[2]	2329	}
[608]	2330	if ( uiLevel >= 1)
[56]	2331	{
[608]	2332	c1Idx ++;
[56]	2333	}
[1313]	2334
[608]	2335	//===== update bin model =====
	2336	if( uiLevel > 1 )
[56]	2337	{
[1313]	2338	c1 = 0;
[608]	2339	c2 += (c2 < 2);
	2340	c2Idx ++;
[56]	2341	}
[608]	2342	else if( (c1 < 3) && (c1 > 0) && uiLevel)
[56]	2343	{
[608]	2344	c1++;
	2345	}
[1313]	2346
[608]	2347	//===== context set update =====
[1313]	2348	if( ( iScanPos % uiCGSize == 0 ) && ( iScanPos > 0 ) )
[608]	2349	{
[1313]	2350	uiCtxSet = getContextSetIndex(compID, ((iScanPos - 1) >> MLS_CG_SIZE), (c1 == 0)); //(iScanPos - 1) because we do this before entering the final group
	2351	c1 = 1;
[608]	2352	c2 = 0;
[1313]	2353	c1Idx = 0;
	2354	c2Idx = 0;
	2355	uiGoRiceParam = initialGolombRiceParameter;
[56]	2356	}
[608]	2357	}
	2358	else
[2]	2359	{
[608]	2360	d64BaseCost += pdCostCoeff0[ iScanPos ];
	2361	}
	2362	rdStats.d64SigCost += pdCostSig[ iScanPos ];
	2363	if (iScanPosinCG == 0 )
	2364	{
	2365	rdStats.d64SigCost_0 = pdCostSig[ iScanPos ];
	2366	}
	2367	if (piDstCoeff[ uiBlkPos ] )
	2368	{
	2369	uiSigCoeffGroupFlag[ uiCGBlkPos ] = 1;
	2370	rdStats.d64CodedLevelandDist += pdCostCoeff[ iScanPos ] - pdCostSig[ iScanPos ];
	2371	rdStats.d64UncodedDist += pdCostCoeff0[ iScanPos ];
	2372	if ( iScanPosinCG != 0 )
[2]	2373	{
[608]	2374	rdStats.iNNZbeforePos0++;
	2375	}
	2376	}
	2377	} //end for (iScanPosinCG)
[1313]	2378
	2379	if (iCGLastScanPos >= 0)
[608]	2380	{
	2381	if( iCGScanPos )
	2382	{
	2383	if (uiSigCoeffGroupFlag[ uiCGBlkPos ] == 0)
	2384	{
[1313]	2385	UInt uiCtxSig = getSigCoeffGroupCtxInc( uiSigCoeffGroupFlag, uiCGPosX, uiCGPosY, codingParameters.widthInGroups, codingParameters.heightInGroups );
	2386	d64BaseCost += xGetRateSigCoeffGroup(0, uiCtxSig) - rdStats.d64SigCost;;
	2387	pdCostCoeffGroupSig[ iCGScanPos ] = xGetRateSigCoeffGroup(0, uiCtxSig);
	2388	}
[608]	2389	else
	2390	{
	2391	if (iCGScanPos < iCGLastScanPos) //skip the last coefficient group, which will be handled together with last position below.
[2]	2392	{
[1313]	2393	if ( rdStats.iNNZbeforePos0 == 0 )
[56]	2394	{
[608]	2395	d64BaseCost -= rdStats.d64SigCost_0;
	2396	rdStats.d64SigCost -= rdStats.d64SigCost_0;
	2397	}
	2398	// rd-cost if SigCoeffGroupFlag = 0, initialization
	2399	Double d64CostZeroCG = d64BaseCost;
[1313]	2400
[608]	2401	// add SigCoeffGroupFlag cost to total cost
[1313]	2402	UInt uiCtxSig = getSigCoeffGroupCtxInc( uiSigCoeffGroupFlag, uiCGPosX, uiCGPosY, codingParameters.widthInGroups, codingParameters.heightInGroups );
	2403
[608]	2404	if (iCGScanPos < iCGLastScanPos)
	2405	{
[1313]	2406	d64BaseCost += xGetRateSigCoeffGroup(1, uiCtxSig);
	2407	d64CostZeroCG += xGetRateSigCoeffGroup(0, uiCtxSig);
	2408	pdCostCoeffGroupSig[ iCGScanPos ] = xGetRateSigCoeffGroup(1, uiCtxSig);
[608]	2409	}
[1313]	2410
[608]	2411	// try to convert the current coeff group from non-zero to all-zero
	2412	d64CostZeroCG += rdStats.d64UncodedDist; // distortion for resetting non-zero levels to zero levels
	2413	d64CostZeroCG -= rdStats.d64CodedLevelandDist; // distortion and level cost for keeping all non-zero levels
	2414	d64CostZeroCG -= rdStats.d64SigCost; // sig cost for all coeffs, including zero levels and non-zerl levels
[1313]	2415
[608]	2416	// if we can save cost, change this block to all-zero block
[1313]	2417	if ( d64CostZeroCG < d64BaseCost )
[608]	2418	{
	2419	uiSigCoeffGroupFlag[ uiCGBlkPos ] = 0;
	2420	d64BaseCost = d64CostZeroCG;
[56]	2421	if (iCGScanPos < iCGLastScanPos)
	2422	{
[1313]	2423	pdCostCoeffGroupSig[ iCGScanPos ] = xGetRateSigCoeffGroup(0, uiCtxSig);
[56]	2424	}
[1313]	2425	// reset coeffs to 0 in this block
[608]	2426	for (Int iScanPosinCG = uiCGSize-1; iScanPosinCG >= 0; iScanPosinCG--)
[56]	2427	{
[608]	2428	iScanPos = iCGScanPos*uiCGSize + iScanPosinCG;
[1313]	2429	UInt uiBlkPos = codingParameters.scan[ iScanPos ];
	2430
[608]	2431	if (piDstCoeff[ uiBlkPos ])
[56]	2432	{
[608]	2433	piDstCoeff [ uiBlkPos ] = 0;
	2434	pdCostCoeff[ iScanPos ] = pdCostCoeff0[ iScanPos ];
	2435	pdCostSig [ iScanPos ] = 0;
[56]	2436	}
[608]	2437	}
[1313]	2438	} // end if ( d64CostAllZeros < d64BaseCost )
[608]	2439	}
	2440	} // end if if (uiSigCoeffGroupFlag[ uiCGBlkPos ] == 0)
[2]	2441	}
[608]	2442	else
	2443	{
	2444	uiSigCoeffGroupFlag[ uiCGBlkPos ] = 1;
	2445	}
	2446	}
	2447	} //end for (iCGScanPos)
[1313]	2448
[56]	2449	//===== estimate last position =====
	2450	if ( iLastScanPos < 0 )
	2451	{
	2452	return;
	2453	}
[1313]	2454
[56]	2455	Double d64BestCost = 0;
	2456	Int ui16CtxCbf = 0;
	2457	Int iBestLastIdxP1 = 0;
[1313]	2458	if( !pcCU->isIntra( uiAbsPartIdx ) && isLuma(compID) && pcCU->getTransformIdx( uiAbsPartIdx ) == 0 )
[2]	2459	{
[56]	2460	ui16CtxCbf = 0;
	2461	d64BestCost = d64BlockUncodedCost + xGetICost( m_pcEstBitsSbac->blockRootCbpBits[ ui16CtxCbf ][ 0 ] );
	2462	d64BaseCost += xGetICost( m_pcEstBitsSbac->blockRootCbpBits[ ui16CtxCbf ][ 1 ] );
[2]	2463	}
	2464	else
	2465	{
[1313]	2466	ui16CtxCbf = pcCU->getCtxQtCbf( rTu, channelType );
	2467	ui16CtxCbf += getCBFContextOffset(compID);
[56]	2468	d64BestCost = d64BlockUncodedCost + xGetICost( m_pcEstBitsSbac->blockCbpBits[ ui16CtxCbf ][ 0 ] );
	2469	d64BaseCost += xGetICost( m_pcEstBitsSbac->blockCbpBits[ ui16CtxCbf ][ 1 ] );
[2]	2470	}
[1313]	2471
	2472
[608]	2473	Bool bFoundLast = false;
	2474	for (Int iCGScanPos = iCGLastScanPos; iCGScanPos >= 0; iCGScanPos--)
	2475	{
[1313]	2476	UInt uiCGBlkPos = codingParameters.scanCG[ iCGScanPos ];
	2477
	2478	d64BaseCost -= pdCostCoeffGroupSig [ iCGScanPos ];
[608]	2479	if (uiSigCoeffGroupFlag[ uiCGBlkPos ])
[1313]	2480	{
[608]	2481	for (Int iScanPosinCG = uiCGSize-1; iScanPosinCG >= 0; iScanPosinCG--)
	2482	{
	2483	iScanPos = iCGScanPos*uiCGSize + iScanPosinCG;
[1313]	2484
	2485	if (iScanPos > iLastScanPos)
	2486	{
	2487	continue;
	2488	}
	2489	UInt uiBlkPos = codingParameters.scan[iScanPos];
	2490
[608]	2491	if( piDstCoeff[ uiBlkPos ] )
[56]	2492	{
[1313]	2493	UInt uiPosY = uiBlkPos >> uiLog2BlockWidth;
	2494	UInt uiPosX = uiBlkPos - ( uiPosY << uiLog2BlockWidth );
	2495
	2496	Double d64CostLast= codingParameters.scanType == SCAN_VER ? xGetRateLast( uiPosY, uiPosX, compID ) : xGetRateLast( uiPosX, uiPosY, compID );
[608]	2497	Double totalCost = d64BaseCost + d64CostLast - pdCostSig[ iScanPos ];
[1313]	2498
[608]	2499	if( totalCost < d64BestCost )
[56]	2500	{
[608]	2501	iBestLastIdxP1 = iScanPos + 1;
	2502	d64BestCost = totalCost;
[56]	2503	}
[608]	2504	if( piDstCoeff[ uiBlkPos ] > 1 )
[56]	2505	{
[608]	2506	bFoundLast = true;
	2507	break;
[56]	2508	}
[608]	2509	d64BaseCost -= pdCostCoeff[ iScanPos ];
	2510	d64BaseCost += pdCostCoeff0[ iScanPos ];
	2511	}
	2512	else
[56]	2513	{
[608]	2514	d64BaseCost -= pdCostSig[ iScanPos ];
[56]	2515	}
[1313]	2516	} //end for
[608]	2517	if (bFoundLast)
	2518	{
	2519	break;
	2520	}
	2521	} // end if (uiSigCoeffGroupFlag[ uiCGBlkPos ])
[1313]	2522	} // end for
	2523
	2524
[56]	2525	for ( Int scanPos = 0; scanPos < iBestLastIdxP1; scanPos++ )
	2526	{
[1313]	2527	Int blkPos = codingParameters.scan[ scanPos ];
	2528	TCoeff level = piDstCoeff[ blkPos ];
[56]	2529	uiAbsSum += level;
	2530	piDstCoeff[ blkPos ] = ( plSrcCoeff[ blkPos ] < 0 ) ? -level : level;
[2]	2531	}
[1313]	2532
[2]	2533	//===== clean uncoded coefficients =====
[56]	2534	for ( Int scanPos = iBestLastIdxP1; scanPos <= iLastScanPos; scanPos++ )
[2]	2535	{
[1313]	2536	piDstCoeff[ codingParameters.scan[ scanPos ] ] = 0;
[56]	2537	}
[1313]	2538
	2539
[56]	2540	if( pcCU->getSlice()->getPPS()->getSignHideFlag() && uiAbsSum>=2)
	2541	{
[1313]	2542	const Double inverseQuantScale = Double(g_invQuantScales[cQP.rem]);
	2543	Int64 rdFactor = (Int64)(inverseQuantScale * inverseQuantScale * (1 << (2 * cQP.per))
	2544	/ m_dLambda / 16 / (1 << (2 * DISTORTION_PRECISION_ADJUSTMENT(channelBitDepth - 8)))
	2545	+ 0.5);
	2546
[56]	2547	Int lastCG = -1;
	2548	Int absSum = 0 ;
	2549	Int n ;
[1313]	2550
	2551	for( Int subSet = (uiWidth*uiHeight-1) >> MLS_CG_SIZE; subSet >= 0; subSet-- )
[2]	2552	{
[1313]	2553	Int subPos = subSet << MLS_CG_SIZE;
	2554	Int firstNZPosInCG=uiCGSize , lastNZPosInCG=-1 ;
[56]	2555	absSum = 0 ;
[1313]	2556
	2557	for(n = uiCGSize-1; n >= 0; --n )
[56]	2558	{
[1313]	2559	if( piDstCoeff[ codingParameters.scan[ n + subPos ]] )
[56]	2560	{
	2561	lastNZPosInCG = n;
	2562	break;
	2563	}
	2564	}
[1313]	2565
	2566	for(n = 0; n <uiCGSize; n++ )
[56]	2567	{
[1313]	2568	if( piDstCoeff[ codingParameters.scan[ n + subPos ]] )
[56]	2569	{
	2570	firstNZPosInCG = n;
	2571	break;
	2572	}
	2573	}
[1313]	2574
[56]	2575	for(n = firstNZPosInCG; n <=lastNZPosInCG; n++ )
	2576	{
[1313]	2577	absSum += Int(piDstCoeff[ codingParameters.scan[ n + subPos ]]);
[56]	2578	}
[1313]	2579
[608]	2580	if(lastNZPosInCG>=0 && lastCG==-1)
[2]	2581	{
[1313]	2582	lastCG = 1;
	2583	}
	2584
[608]	2585	if( lastNZPosInCG-firstNZPosInCG>=SBH_THRESHOLD )
	2586	{
[1313]	2587	UInt signbit = (piDstCoeff[codingParameters.scan[subPos+firstNZPosInCG]]>0?0:1);
[56]	2588	if( signbit!=(absSum&0x1) ) // hide but need tune
	2589	{
[1313]	2590	// calculate the cost
	2591	Int64 minCostInc = std::numeric_limits<Int64>::max(), curCost = std::numeric_limits<Int64>::max();
	2592	Int minPos = -1, finalChange = 0, curChange = 0;
	2593
	2594	for( n = (lastCG==1?lastNZPosInCG:uiCGSize-1) ; n >= 0; --n )
[56]	2595	{
[1313]	2596	UInt uiBlkPos = codingParameters.scan[ n + subPos ];
[56]	2597	if(piDstCoeff[ uiBlkPos ] != 0 )
	2598	{
[1313]	2599	Int64 costUp = rdFactor * ( - deltaU[uiBlkPos] ) + rateIncUp[uiBlkPos];
	2600	Int64 costDown = rdFactor * ( deltaU[uiBlkPos] ) + rateIncDown[uiBlkPos]
	2601	- ((abs(piDstCoeff[uiBlkPos]) == 1) ? sigRateDelta[uiBlkPos] : 0);
	2602
[56]	2603	if(lastCG==1 && lastNZPosInCG==n && abs(piDstCoeff[uiBlkPos])==1)
	2604	{
[1313]	2605	costDown -= (4<<15);
[56]	2606	}
[1313]	2607
[56]	2608	if(costUp<costDown)
[1313]	2609	{
[56]	2610	curCost = costUp;
[1313]	2611	curChange = 1;
[56]	2612	}
[1313]	2613	else
[56]	2614	{
[1313]	2615	curChange = -1;
[56]	2616	if(n==firstNZPosInCG && abs(piDstCoeff[uiBlkPos])==1)
	2617	{
[1313]	2618	curCost = std::numeric_limits<Int64>::max();
[56]	2619	}
	2620	else
	2621	{
[1313]	2622	curCost = costDown;
[56]	2623	}
	2624	}
	2625	}
	2626	else
	2627	{
[1313]	2628	curCost = rdFactor * ( - (abs(deltaU[uiBlkPos])) ) + (1<<15) + rateIncUp[uiBlkPos] + sigRateDelta[uiBlkPos] ;
[56]	2629	curChange = 1 ;
[1313]	2630
[56]	2631	if(n<firstNZPosInCG)
	2632	{
	2633	UInt thissignbit = (plSrcCoeff[uiBlkPos]>=0?0:1);
	2634	if(thissignbit != signbit )
	2635	{
[1313]	2636	curCost = std::numeric_limits<Int64>::max();
[56]	2637	}
	2638	}
	2639	}
[1313]	2640
[56]	2641	if( curCost<minCostInc)
	2642	{
[1313]	2643	minCostInc = curCost;
	2644	finalChange = curChange;
	2645	minPos = uiBlkPos;
[56]	2646	}
	2647	}
[1313]	2648
	2649	if(piDstCoeff[minPos] == entropyCodingMaximum \|\| piDstCoeff[minPos] == entropyCodingMinimum)
[56]	2650	{
	2651	finalChange = -1;
	2652	}
[1313]	2653
[56]	2654	if(plSrcCoeff[minPos]>=0)
	2655	{
	2656	piDstCoeff[minPos] += finalChange ;
	2657	}
	2658	else
	2659	{
[1313]	2660	piDstCoeff[minPos] -= finalChange ;
	2661	}
[56]	2662	}
[2]	2663	}
[1313]	2664
[56]	2665	if(lastCG==1)
[2]	2666	{
[1313]	2667	lastCG=0 ;
[56]	2668	}
[2]	2669	}
	2670	}
	2671	}
	2672
[1313]	2673
[608]	2674	/** Pattern decision for context derivation process of significant_coeff_flag
	2675	* \param sigCoeffGroupFlag pointer to prior coded significant coeff group
[1313]	2676	* \param uiCGPosX column of current coefficient group
	2677	* \param uiCGPosY row of current coefficient group
	2678	* \param widthInGroups width of the block
	2679	* \param heightInGroups height of the block
[608]	2680	* \returns pattern for current coefficient group
	2681	*/
[1313]	2682	Int TComTrQuant::calcPatternSigCtx( const UInt* sigCoeffGroupFlag, UInt uiCGPosX, UInt uiCGPosY, UInt widthInGroups, UInt heightInGroups )
[608]	2683	{
[1313]	2684	if ((widthInGroups <= 1) && (heightInGroups <= 1))
	2685	{
	2686	return 0;
	2687	}
[608]	2688
[1313]	2689	const Bool rightAvailable = uiCGPosX < (widthInGroups - 1);
	2690	const Bool belowAvailable = uiCGPosY < (heightInGroups - 1);
	2691
[608]	2692	UInt sigRight = 0;
	2693	UInt sigLower = 0;
	2694
[1313]	2695	if (rightAvailable)
[608]	2696	{
[1313]	2697	sigRight = ((sigCoeffGroupFlag[ (uiCGPosY * widthInGroups) + uiCGPosX + 1 ] != 0) ? 1 : 0);
[608]	2698	}
[1313]	2699	if (belowAvailable)
[608]	2700	{
[1313]	2701	sigLower = ((sigCoeffGroupFlag[ (uiCGPosY + 1) * widthInGroups + uiCGPosX ] != 0) ? 1 : 0);
[608]	2702	}
[1313]	2703
	2704	return sigRight + (sigLower << 1);
[608]	2705	}
	2706
[1313]	2707
[2]	2708	/** Context derivation process of coeff_abs_significant_flag
[608]	2709	* \param patternSigCtx pattern for current coefficient group
[1313]	2710	* \param codingParameters coding parameters for the TU (includes the scan)
	2711	* \param scanPosition current position in scan order
	2712	* \param log2BlockWidth log2 width of the block
	2713	* \param log2BlockHeight log2 height of the block
	2714	* \param chanType channel type (CHANNEL_TYPE_LUMA/CHROMA)
[2]	2715	* \returns ctxInc for current scan position
	2716	*/
[1313]	2717	Int TComTrQuant::getSigCtxInc ( Int patternSigCtx,
	2718	const TUEntropyCodingParameters &codingParameters,
	2719	const Int scanPosition,
	2720	const Int log2BlockWidth,
	2721	const Int log2BlockHeight,
	2722	const ChannelType chanType)
[2]	2723	{
[1313]	2724	if (codingParameters.firstSignificanceMapContext == significanceMapContextSetStart[chanType][CONTEXT_TYPE_SINGLE])
[2]	2725	{
[1313]	2726	//single context mode
	2727	return significanceMapContextSetStart[chanType][CONTEXT_TYPE_SINGLE];
[2]	2728	}
[608]	2729
[1313]	2730	const UInt rasterPosition = codingParameters.scan[scanPosition];
	2731	const UInt posY = rasterPosition >> log2BlockWidth;
	2732	const UInt posX = rasterPosition - (posY << log2BlockWidth);
	2733
	2734	if ((posX + posY) == 0)
[2]	2735	{
[1313]	2736	return 0; //special case for the DC context variable
[2]	2737	}
[56]	2738
[1313]	2739	Int offset = MAX_INT;
[608]	2740
[1313]	2741	if ((log2BlockWidth == 2) && (log2BlockHeight == 2)) //4x4
[2]	2742	{
[1313]	2743	offset = ctxIndMap4x4[ (4 * posY) + posX ];
[2]	2744	}
[608]	2745	else
[2]	2746	{
[1313]	2747	Int cnt = 0;
	2748
	2749	switch (patternSigCtx)
	2750	{
	2751	//------------------
	2752
	2753	case 0: //neither neighbouring group is significant
	2754	{
	2755	const Int posXinSubset = posX & ((1 << MLS_CG_LOG2_WIDTH) - 1);
	2756	const Int posYinSubset = posY & ((1 << MLS_CG_LOG2_HEIGHT) - 1);
	2757	const Int posTotalInSubset = posXinSubset + posYinSubset;
	2758
	2759	//first N coefficients in scan order use 2; the next few use 1; the rest use 0.
	2760	const UInt context1Threshold = NEIGHBOURHOOD_00_CONTEXT_1_THRESHOLD_4x4;
	2761	const UInt context2Threshold = NEIGHBOURHOOD_00_CONTEXT_2_THRESHOLD_4x4;
	2762
	2763	cnt = (posTotalInSubset >= context1Threshold) ? 0 : ((posTotalInSubset >= context2Threshold) ? 1 : 2);
	2764	}
	2765	break;
	2766
	2767	//------------------
	2768
	2769	case 1: //right group is significant, below is not
	2770	{
	2771	const Int posYinSubset = posY & ((1 << MLS_CG_LOG2_HEIGHT) - 1);
	2772	const Int groupHeight = 1 << MLS_CG_LOG2_HEIGHT;
	2773
	2774	cnt = (posYinSubset >= (groupHeight >> 1)) ? 0 : ((posYinSubset >= (groupHeight >> 2)) ? 1 : 2); //top quarter uses 2; second-from-top quarter uses 1; bottom half uses 0
	2775	}
	2776	break;
	2777
	2778	//------------------
	2779
	2780	case 2: //below group is significant, right is not
	2781	{
	2782	const Int posXinSubset = posX & ((1 << MLS_CG_LOG2_WIDTH) - 1);
	2783	const Int groupWidth = 1 << MLS_CG_LOG2_WIDTH;
	2784
	2785	cnt = (posXinSubset >= (groupWidth >> 1)) ? 0 : ((posXinSubset >= (groupWidth >> 2)) ? 1 : 2); //left quarter uses 2; second-from-left quarter uses 1; right half uses 0
	2786	}
	2787	break;
	2788
	2789	//------------------
	2790
	2791	case 3: //both neighbouring groups are significant
	2792	{
	2793	cnt = 2;
	2794	}
	2795	break;
	2796
	2797	//------------------
	2798
	2799	default:
	2800	std::cerr << "ERROR: Invalid patternSigCtx \"" << Int(patternSigCtx) << "\" in getSigCtxInc" << std::endl;
	2801	exit(1);
	2802	break;
	2803	}
	2804
	2805	//------------------------------------------------
	2806
	2807	const Bool notFirstGroup = ((posX >> MLS_CG_LOG2_WIDTH) + (posY >> MLS_CG_LOG2_HEIGHT)) > 0;
	2808
	2809	offset = (notFirstGroup ? notFirstGroupNeighbourhoodContextOffset[chanType] : 0) + cnt;
[2]	2810	}
[56]	2811
[1313]	2812	return codingParameters.firstSignificanceMapContext + offset;
[2]	2813	}
	2814
[1313]	2815
[2]	2816	/** Get the best level in RD sense
[1313]	2817	*
[2]	2818	* \returns best quantized transform level for given scan position
[1313]	2819	*
[2]	2820	* This method calculates the best quantized transform level for a given scan position.
	2821	*/
[1313]	2822	__inline UInt TComTrQuant::xGetCodedLevel ( Double& rd64CodedCost, //< reference to coded cost
	2823	Double& rd64CodedCost0, //< reference to cost when coefficient is 0
	2824	Double& rd64CodedCostSig, //< rd64CodedCostSig reference to cost of significant coefficient
	2825	Intermediate_Int lLevelDouble, //< reference to unscaled quantized level
	2826	UInt uiMaxAbsLevel, //< scaled quantized level
	2827	UShort ui16CtxNumSig, //< current ctxInc for coeff_abs_significant_flag
	2828	UShort ui16CtxNumOne, //< current ctxInc for coeff_abs_level_greater1 (1st bin of coeff_abs_level_minus1 in AVC)
	2829	UShort ui16CtxNumAbs, //< current ctxInc for coeff_abs_level_greater2 (remaining bins of coeff_abs_level_minus1 in AVC)
	2830	UShort ui16AbsGoRice, //< current Rice parameter for coeff_abs_level_minus3
	2831	UInt c1Idx, //<
	2832	UInt c2Idx, //<
	2833	Int iQBits, //< quantization step size
	2834	Double errorScale, //<
	2835	Bool bLast, //< indicates if the coefficient is the last significant
	2836	Bool useLimitedPrefixLength, //<
	2837	const Int maxLog2TrDynamicRange //<
	2838	) const
[2]	2839	{
[1313]	2840	Double dCurrCostSig = 0;
[2]	2841	UInt uiBestAbsLevel = 0;
[1313]	2842
[56]	2843	if( !bLast && uiMaxAbsLevel < 3 )
[2]	2844	{
[1313]	2845	rd64CodedCostSig = xGetRateSigCoef( 0, ui16CtxNumSig );
[56]	2846	rd64CodedCost = rd64CodedCost0 + rd64CodedCostSig;
	2847	if( uiMaxAbsLevel == 0 )
	2848	{
	2849	return uiBestAbsLevel;
	2850	}
[2]	2851	}
	2852	else
	2853	{
[56]	2854	rd64CodedCost = MAX_DOUBLE;
[2]	2855	}
	2856
[56]	2857	if( !bLast )
[2]	2858	{
[56]	2859	dCurrCostSig = xGetRateSigCoef( 1, ui16CtxNumSig );
[2]	2860	}
	2861
[56]	2862	UInt uiMinAbsLevel = ( uiMaxAbsLevel > 1 ? uiMaxAbsLevel - 1 : 1 );
	2863	for( Int uiAbsLevel = uiMaxAbsLevel; uiAbsLevel >= uiMinAbsLevel ; uiAbsLevel-- )
[2]	2864	{
[1313]	2865	Double dErr = Double( lLevelDouble - ( Intermediate_Int(uiAbsLevel) << iQBits ) );
	2866	Double dCurrCost = dErr * dErr * errorScale + xGetICost( xGetICRate( uiAbsLevel, ui16CtxNumOne, ui16CtxNumAbs, ui16AbsGoRice, c1Idx, c2Idx, useLimitedPrefixLength, maxLog2TrDynamicRange ) );
[56]	2867	dCurrCost += dCurrCostSig;
[2]	2868
	2869	if( dCurrCost < rd64CodedCost )
	2870	{
[56]	2871	uiBestAbsLevel = uiAbsLevel;
	2872	rd64CodedCost = dCurrCost;
	2873	rd64CodedCostSig = dCurrCostSig;
[2]	2874	}
	2875	}
[56]	2876
[2]	2877	return uiBestAbsLevel;
	2878	}
	2879
	2880	/** Calculates the cost for specific absolute transform level
	2881	* \param uiAbsLevel scaled quantized level
	2882	* \param ui16CtxNumOne current ctxInc for coeff_abs_level_greater1 (1st bin of coeff_abs_level_minus1 in AVC)
	2883	* \param ui16CtxNumAbs current ctxInc for coeff_abs_level_greater2 (remaining bins of coeff_abs_level_minus1 in AVC)
	2884	* \param ui16AbsGoRice Rice parameter for coeff_abs_level_minus3
[1313]	2885	* \param c1Idx
	2886	* \param c2Idx
	2887	* \param useLimitedPrefixLength
	2888	* \param maxLog2TrDynamicRange
[2]	2889	* \returns cost of given absolute transform level
	2890	*/
[1313]	2891	__inline Int TComTrQuant::xGetICRate ( const UInt uiAbsLevel,
	2892	const UShort ui16CtxNumOne,
	2893	const UShort ui16CtxNumAbs,
	2894	const UShort ui16AbsGoRice,
	2895	const UInt c1Idx,
	2896	const UInt c2Idx,
	2897	const Bool useLimitedPrefixLength,
	2898	const Int maxLog2TrDynamicRange
[56]	2899	) const
[2]	2900	{
[1313]	2901	Int iRate = Int(xGetIEPRate()); // cost of sign bit
	2902	UInt baseLevel = (c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx < C2FLAG_NUMBER)) : 1;
[56]	2903
	2904	if ( uiAbsLevel >= baseLevel )
[1313]	2905	{
[608]	2906	UInt symbol = uiAbsLevel - baseLevel;
	2907	UInt length;
	2908	if (symbol < (COEF_REMAIN_BIN_REDUCTION << ui16AbsGoRice))
[2]	2909	{
[608]	2910	length = symbol>>ui16AbsGoRice;
	2911	iRate += (length+1+ui16AbsGoRice)<< 15;
[2]	2912	}
[1313]	2913	else if (useLimitedPrefixLength)
	2914	{
	2915	const UInt maximumPrefixLength = (32 - (COEF_REMAIN_BIN_REDUCTION + maxLog2TrDynamicRange));
	2916
	2917	UInt prefixLength = 0;
	2918	UInt suffix = (symbol >> ui16AbsGoRice) - COEF_REMAIN_BIN_REDUCTION;
	2919
	2920	while ((prefixLength < maximumPrefixLength) && (suffix > ((2 << prefixLength) - 2)))
	2921	{
	2922	prefixLength++;
	2923	}
	2924
	2925	const UInt suffixLength = (prefixLength == maximumPrefixLength) ? (maxLog2TrDynamicRange - ui16AbsGoRice) : (prefixLength + 1/separator/);
	2926
	2927	iRate += (COEF_REMAIN_BIN_REDUCTION + prefixLength + suffixLength + ui16AbsGoRice) << 15;
	2928	}
[608]	2929	else
	2930	{
	2931	length = ui16AbsGoRice;
	2932	symbol = symbol - ( COEF_REMAIN_BIN_REDUCTION << ui16AbsGoRice);
	2933	while (symbol >= (1<<length))
	2934	{
[1313]	2935	symbol -= (1<<(length++));
[608]	2936	}
	2937	iRate += (COEF_REMAIN_BIN_REDUCTION+length+1-ui16AbsGoRice+length)<< 15;
	2938	}
[1313]	2939
[56]	2940	if (c1Idx < C1FLAG_NUMBER)
	2941	{
	2942	iRate += m_pcEstBitsSbac->m_greaterOneBits[ ui16CtxNumOne ][ 1 ];
	2943
	2944	if (c2Idx < C2FLAG_NUMBER)
	2945	{
	2946	iRate += m_pcEstBitsSbac->m_levelAbsBits[ ui16CtxNumAbs ][ 1 ];
	2947	}
	2948	}
[2]	2949	}
[1313]	2950	else if( uiAbsLevel == 1 )
[2]	2951	{
[56]	2952	iRate += m_pcEstBitsSbac->m_greaterOneBits[ ui16CtxNumOne ][ 0 ];
[2]	2953	}
	2954	else if( uiAbsLevel == 2 )
	2955	{
[56]	2956	iRate += m_pcEstBitsSbac->m_greaterOneBits[ ui16CtxNumOne ][ 1 ];
	2957	iRate += m_pcEstBitsSbac->m_levelAbsBits[ ui16CtxNumAbs ][ 0 ];
[2]	2958	}
	2959	else
	2960	{
[872]	2961	iRate = 0;
[2]	2962	}
[1313]	2963
	2964	return iRate;
[2]	2965	}
	2966
[56]	2967	__inline Double TComTrQuant::xGetRateSigCoeffGroup ( UShort uiSignificanceCoeffGroup,
	2968	UShort ui16CtxNumSig ) const
	2969	{
	2970	return xGetICost( m_pcEstBitsSbac->significantCoeffGroupBits[ ui16CtxNumSig ][ uiSignificanceCoeffGroup ] );
	2971	}
	2972
[2]	2973	/** Calculates the cost of signaling the last significant coefficient in the block
	2974	* \param uiPosX X coordinate of the last significant coefficient
	2975	* \param uiPosY Y coordinate of the last significant coefficient
[1313]	2976	* \param component colour component ID
[2]	2977	* \returns cost of last significant coefficient
	2978	*/
[56]	2979	/*
	2980	* \param uiWidth width of the transform unit (TU)
	2981	*/
	2982	__inline Double TComTrQuant::xGetRateLast ( const UInt uiPosX,
[1313]	2983	const UInt uiPosY,
	2984	const ComponentID component ) const
[2]	2985	{
[56]	2986	UInt uiCtxX = g_uiGroupIdx[uiPosX];
	2987	UInt uiCtxY = g_uiGroupIdx[uiPosY];
[1313]	2988
	2989	Double uiCost = m_pcEstBitsSbac->lastXBits[toChannelType(component)][ uiCtxX ] + m_pcEstBitsSbac->lastYBits[toChannelType(component)][ uiCtxY ];
	2990
[56]	2991	if( uiCtxX > 3 )
	2992	{
	2993	uiCost += xGetIEPRate() * ((uiCtxX-2)>>1);
	2994	}
	2995	if( uiCtxY > 3 )
	2996	{
	2997	uiCost += xGetIEPRate() * ((uiCtxY-2)>>1);
	2998	}
	2999	return xGetICost( uiCost );
[2]	3000	}
	3001
	3002	__inline Double TComTrQuant::xGetRateSigCoef ( UShort uiSignificance,
	3003	UShort ui16CtxNumSig ) const
	3004	{
	3005	return xGetICost( m_pcEstBitsSbac->significantBits[ ui16CtxNumSig ][ uiSignificance ] );
	3006	}
	3007
	3008	/** Get the cost for a specific rate
	3009	* \param dRate rate of a bit
	3010	* \returns cost at the specific rate
	3011	*/
	3012	__inline Double TComTrQuant::xGetICost ( Double dRate ) const
	3013	{
	3014	return m_dLambda * dRate;
	3015	}
	3016
	3017	/** Get the cost of an equal probable bit
	3018	* \returns cost of equal probable bit
	3019	*/
	3020	__inline Double TComTrQuant::xGetIEPRate ( ) const
	3021	{
	3022	return 32768;
	3023	}
[56]	3024
	3025	/** Context derivation process of coeff_abs_significant_flag
	3026	* \param uiSigCoeffGroupFlag significance map of L1
[1313]	3027	* \param uiCGPosX column of current scan position
	3028	* \param uiCGPosY row of current scan position
	3029	* \param widthInGroups width of the block
	3030	* \param heightInGroups height of the block
[56]	3031	* \returns ctxInc for current scan position
	3032	*/
[1313]	3033	UInt TComTrQuant::getSigCoeffGroupCtxInc (const UInt* uiSigCoeffGroupFlag,
	3034	const UInt uiCGPosX,
	3035	const UInt uiCGPosY,
	3036	const UInt widthInGroups,
	3037	const UInt heightInGroups)
[56]	3038	{
[1313]	3039	UInt sigRight = 0;
	3040	UInt sigLower = 0;
[56]	3041
[1313]	3042	if (uiCGPosX < (widthInGroups - 1))
[56]	3043	{
[1313]	3044	sigRight = ((uiSigCoeffGroupFlag[ (uiCGPosY * widthInGroups) + uiCGPosX + 1 ] != 0) ? 1 : 0);
[56]	3045	}
[1313]	3046	if (uiCGPosY < (heightInGroups - 1))
[56]	3047	{
[1313]	3048	sigLower = ((uiSigCoeffGroupFlag[ (uiCGPosY + 1) * widthInGroups + uiCGPosX ] != 0) ? 1 : 0);
[56]	3049	}
	3050
[1313]	3051	return ((sigRight + sigLower) != 0) ? 1 : 0;
[56]	3052	}
[1313]	3053
	3054
[56]	3055	/** set quantized matrix coefficient for encode
[1313]	3056	* \param scalingList quantized matrix address
	3057	* \param format chroma format
	3058	* \param maxLog2TrDynamicRange
	3059	* \param bitDepths reference to bit depth array for all channels
[56]	3060	*/
[1313]	3061	Void TComTrQuant::setScalingList(TComScalingList *scalingList, const Int maxLog2TrDynamicRange[MAX_NUM_CHANNEL_TYPE], const BitDepths &bitDepths)
[56]	3062	{
[1313]	3063	const Int minimumQp = 0;
	3064	const Int maximumQp = SCALING_LIST_REM_NUM;
[56]	3065
[1313]	3066	for(UInt size = 0; size < SCALING_LIST_SIZE_NUM; size++)
[56]	3067	{
[1313]	3068	for(UInt list = 0; list < SCALING_LIST_NUM; list++)
[56]	3069	{
[1313]	3070	for(Int qp = minimumQp; qp < maximumQp; qp++)
[56]	3071	{
	3072	xSetScalingListEnc(scalingList,list,size,qp);
[1313]	3073	xSetScalingListDec(*scalingList,list,size,qp);
	3074	setErrScaleCoeff(list,size,qp,maxLog2TrDynamicRange, bitDepths);
[56]	3075	}
	3076	}
	3077	}
	3078	}
	3079	/** set quantized matrix coefficient for decode
[1313]	3080	* \param scalingList quantized matrix address
	3081	* \param format chroma format
[56]	3082	*/
[1313]	3083	Void TComTrQuant::setScalingListDec(const TComScalingList &scalingList)
[56]	3084	{
[1313]	3085	const Int minimumQp = 0;
	3086	const Int maximumQp = SCALING_LIST_REM_NUM;
[56]	3087
[1313]	3088	for(UInt size = 0; size < SCALING_LIST_SIZE_NUM; size++)
[56]	3089	{
[1313]	3090	for(UInt list = 0; list < SCALING_LIST_NUM; list++)
[56]	3091	{
[1313]	3092	for(Int qp = minimumQp; qp < maximumQp; qp++)
[56]	3093	{
	3094	xSetScalingListDec(scalingList,list,size,qp);
	3095	}
	3096	}
	3097	}
	3098	}
	3099	/** set error scale coefficients
[1313]	3100	* \param list list ID
	3101	* \param size
	3102	* \param qp quantization parameter
	3103	* \param maxLog2TrDynamicRange
	3104	* \param bitDepths reference to bit depth array for all channels
[56]	3105	*/
[1313]	3106	Void TComTrQuant::setErrScaleCoeff(UInt list, UInt size, Int qp, const Int maxLog2TrDynamicRange[MAX_NUM_CHANNEL_TYPE], const BitDepths &bitDepths)
[56]	3107	{
[1313]	3108	const UInt uiLog2TrSize = g_aucConvertToBit[ g_scalingListSizeX[size] ] + 2;
	3109	const ChannelType channelType = ((list == 0) \|\| (list == MAX_NUM_COMPONENT)) ? CHANNEL_TYPE_LUMA : CHANNEL_TYPE_CHROMA;
[56]	3110
[1313]	3111	const Int channelBitDepth = bitDepths.recon[channelType];
	3112	const Int iTransformShift = getTransformShift(channelBitDepth, uiLog2TrSize, maxLog2TrDynamicRange[channelType]); // Represents scaling through forward transform
[56]	3113
	3114	UInt i,uiMaxNumCoeff = g_scalingListSize[size];
	3115	Int *piQuantcoeff;
[608]	3116	Double *pdErrScale;
	3117	piQuantcoeff = getQuantCoeff(list, qp,size);
	3118	pdErrScale = getErrScaleCoeff(list, size, qp);
[56]	3119
[1313]	3120	Double dErrScale = (Double)(1<<SCALE_BITS); // Compensate for scaling of bitcount in Lagrange cost function
	3121	dErrScale = dErrScalepow(2.0,(-2.0iTransformShift)); // Compensate for scaling through forward transform
	3122
[56]	3123	for(i=0;i<uiMaxNumCoeff;i++)
	3124	{
[1313]	3125	pdErrScale[i] = dErrScale / piQuantcoeff[i] / piQuantcoeff[i] / (1 << DISTORTION_PRECISION_ADJUSTMENT(2 * (bitDepths.recon[channelType] - 8)));
[56]	3126	}
[1313]	3127
	3128	getErrScaleCoeffNoScalingList(list, size, qp) = dErrScale / g_quantScales[qp] / g_quantScales[qp] / (1 << DISTORTION_PRECISION_ADJUSTMENT(2 * (bitDepths.recon[channelType] - 8)));
[56]	3129	}
	3130
	3131	/** set quantized matrix coefficient for encode
[1313]	3132	* \param scalingList quantized matrix address
[56]	3133	* \param listId List index
	3134	* \param sizeId size index
[1313]	3135	* \param qp Quantization parameter
	3136	* \param format chroma format
[56]	3137	*/
[1313]	3138	Void TComTrQuant::xSetScalingListEnc(TComScalingList *scalingList, UInt listId, UInt sizeId, Int qp)
[56]	3139	{
[1313]	3140	UInt width = g_scalingListSizeX[sizeId];
[56]	3141	UInt height = g_scalingListSizeX[sizeId];
[1313]	3142	UInt ratio = g_scalingListSizeX[sizeId]/min(MAX_MATRIX_SIZE_NUM,(Int)g_scalingListSizeX[sizeId]);
[56]	3143	Int *quantcoeff;
[1313]	3144	Int *coeff = scalingList->getScalingListAddress(sizeId,listId);
	3145	quantcoeff = getQuantCoeff(listId, qp, sizeId);
[56]	3146
[1313]	3147	Int quantScales = g_quantScales[qp];
	3148
	3149	processScalingListEnc(coeff,
	3150	quantcoeff,
	3151	(quantScales << LOG2_SCALING_LIST_NEUTRAL_VALUE),
	3152	height, width, ratio,
	3153	min(MAX_MATRIX_SIZE_NUM, (Int)g_scalingListSizeX[sizeId]),
	3154	scalingList->getScalingListDC(sizeId,listId));
[56]	3155	}
[1313]	3156
[56]	3157	/** set quantized matrix coefficient for decode
	3158	* \param scalingList quantaized matrix address
[1313]	3159	* \param listId List index
	3160	* \param sizeId size index
	3161	* \param qp Quantization parameter
	3162	* \param format chroma format
[56]	3163	*/
[1313]	3164	Void TComTrQuant::xSetScalingListDec(const TComScalingList &scalingList, UInt listId, UInt sizeId, Int qp)
[56]	3165	{
[1313]	3166	UInt width = g_scalingListSizeX[sizeId];
[56]	3167	UInt height = g_scalingListSizeX[sizeId];
[1313]	3168	UInt ratio = g_scalingListSizeX[sizeId]/min(MAX_MATRIX_SIZE_NUM,(Int)g_scalingListSizeX[sizeId]);
[56]	3169	Int *dequantcoeff;
[1313]	3170	const Int *coeff = scalingList.getScalingListAddress(sizeId,listId);
[56]	3171
[608]	3172	dequantcoeff = getDequantCoeff(listId, qp, sizeId);
[1313]	3173
	3174	Int invQuantScale = g_invQuantScales[qp];
	3175
	3176	processScalingListDec(coeff,
	3177	dequantcoeff,
	3178	invQuantScale,
	3179	height, width, ratio,
	3180	min(MAX_MATRIX_SIZE_NUM, (Int)g_scalingListSizeX[sizeId]),
	3181	scalingList.getScalingListDC(sizeId,listId));
[56]	3182	}
	3183
	3184	/** set flat matrix value to quantized coefficient
	3185	*/
[1313]	3186	Void TComTrQuant::setFlatScalingList(const Int maxLog2TrDynamicRange[MAX_NUM_CHANNEL_TYPE], const BitDepths &bitDepths)
[56]	3187	{
[1313]	3188	const Int minimumQp = 0;
	3189	const Int maximumQp = SCALING_LIST_REM_NUM;
[56]	3190
[1313]	3191	for(UInt size = 0; size < SCALING_LIST_SIZE_NUM; size++)
[56]	3192	{
[1313]	3193	for(UInt list = 0; list < SCALING_LIST_NUM; list++)
[56]	3194	{
[1313]	3195	for(Int qp = minimumQp; qp < maximumQp; qp++)
[56]	3196	{
	3197	xsetFlatScalingList(list,size,qp);
[1313]	3198	setErrScaleCoeff(list,size,qp,maxLog2TrDynamicRange, bitDepths);
[56]	3199	}
	3200	}
	3201	}
	3202	}
	3203
	3204	/** set flat matrix value to quantized coefficient
	3205	* \param list List ID
[1313]	3206	* \param size size index
	3207	* \param qp Quantization parameter
	3208	* \param format chroma format
[56]	3209	*/
[1313]	3210	Void TComTrQuant::xsetFlatScalingList(UInt list, UInt size, Int qp)
[56]	3211	{
	3212	UInt i,num = g_scalingListSize[size];
	3213	Int *quantcoeff;
	3214	Int *dequantcoeff;
	3215
[1313]	3216	Int quantScales = g_quantScales [qp];
	3217	Int invQuantScales = g_invQuantScales[qp] << 4;
	3218
[608]	3219	quantcoeff = getQuantCoeff(list, qp, size);
	3220	dequantcoeff = getDequantCoeff(list, qp, size);
[56]	3221
	3222	for(i=0;i<num;i++)
[1313]	3223	{
[56]	3224	*quantcoeff++ = quantScales;
	3225	*dequantcoeff++ = invQuantScales;
	3226	}
	3227	}
	3228
	3229	/** set quantized matrix coefficient for encode
	3230	* \param coeff quantaized matrix address
	3231	* \param quantcoeff quantaized matrix address
	3232	* \param quantScales Q(QP%6)
	3233	* \param height height
	3234	* \param width width
	3235	* \param ratio ratio for upscale
	3236	* \param sizuNum matrix size
	3237	* \param dc dc parameter
	3238	*/
	3239	Void TComTrQuant::processScalingListEnc( Int coeff, Int quantcoeff, Int quantScales, UInt height, UInt width, UInt ratio, Int sizuNum, UInt dc)
	3240	{
	3241	for(UInt j=0;j<height;j++)
	3242	{
	3243	for(UInt i=0;i<width;i++)
	3244	{
[1313]	3245	quantcoeff[jwidth + i] = quantScales / coeff[sizuNum (j / ratio) + i / ratio];
[56]	3246	}
	3247	}
[1313]	3248
[56]	3249	if(ratio > 1)
	3250	{
	3251	quantcoeff[0] = quantScales / dc;
	3252	}
	3253	}
[1313]	3254
[56]	3255	/** set quantized matrix coefficient for decode
	3256	* \param coeff quantaized matrix address
	3257	* \param dequantcoeff quantaized matrix address
	3258	* \param invQuantScales IQ(QP%6))
	3259	* \param height height
	3260	* \param width width
	3261	* \param ratio ratio for upscale
	3262	* \param sizuNum matrix size
	3263	* \param dc dc parameter
	3264	*/
[1313]	3265	Void TComTrQuant::processScalingListDec( const Int coeff, Int dequantcoeff, Int invQuantScales, UInt height, UInt width, UInt ratio, Int sizuNum, UInt dc)
[56]	3266	{
	3267	for(UInt j=0;j<height;j++)
	3268	{
	3269	for(UInt i=0;i<width;i++)
	3270	{
[608]	3271	dequantcoeff[jwidth + i] = invQuantScales coeff[sizuNum * (j / ratio) + i / ratio];
[56]	3272	}
	3273	}
[1313]	3274
[56]	3275	if(ratio > 1)
	3276	{
	3277	dequantcoeff[0] = invQuantScales * dc;
	3278	}
	3279	}
	3280
	3281	/** initialization process of scaling list array
	3282	*/
	3283	Void TComTrQuant::initScalingList()
	3284	{
	3285	for(UInt sizeId = 0; sizeId < SCALING_LIST_SIZE_NUM; sizeId++)
	3286	{
[1313]	3287	for(UInt qp = 0; qp < SCALING_LIST_REM_NUM; qp++)
[56]	3288	{
[1313]	3289	for(UInt listId = 0; listId < SCALING_LIST_NUM; listId++)
[56]	3290	{
[1313]	3291	m_quantCoef [sizeId][listId][qp] = new Int [g_scalingListSize[sizeId]];
	3292	m_dequantCoef [sizeId][listId][qp] = new Int [g_scalingListSize[sizeId]];
[608]	3293	m_errScale [sizeId][listId][qp] = new Double [g_scalingListSize[sizeId]];
[1313]	3294	} // listID loop
[56]	3295	}
	3296	}
	3297	}
[1313]	3298
[56]	3299	/** destroy quantization matrix array
	3300	*/
	3301	Void TComTrQuant::destroyScalingList()
	3302	{
	3303	for(UInt sizeId = 0; sizeId < SCALING_LIST_SIZE_NUM; sizeId++)
	3304	{
[1313]	3305	for(UInt listId = 0; listId < SCALING_LIST_NUM; listId++)
[56]	3306	{
	3307	for(UInt qp = 0; qp < SCALING_LIST_REM_NUM; qp++)
	3308	{
[1313]	3309	if(m_quantCoef[sizeId][listId][qp])
	3310	{
	3311	delete [] m_quantCoef[sizeId][listId][qp];
	3312	}
	3313	if(m_dequantCoef[sizeId][listId][qp])
	3314	{
	3315	delete [] m_dequantCoef[sizeId][listId][qp];
	3316	}
	3317	if(m_errScale[sizeId][listId][qp])
	3318	{
	3319	delete [] m_errScale[sizeId][listId][qp];
	3320	}
[56]	3321	}
	3322	}
	3323	}
	3324	}
	3325
[1313]	3326	Void TComTrQuant::transformSkipQuantOneSample(TComTU &rTu, const ComponentID compID, const TCoeff resiDiff, TCoeff* pcCoeff, const UInt uiPos, const QpParam &cQP, const Bool bUseHalfRoundingPoint)
	3327	{
	3328	TComDataCU *pcCU = rTu.getCU();
	3329	const UInt uiAbsPartIdx = rTu.GetAbsPartIdxTU();
	3330	const TComRectangle &rect = rTu.getRect(compID);
	3331	const UInt uiWidth = rect.width;
	3332	const UInt uiHeight = rect.height;
	3333	const Int maxLog2TrDynamicRange = pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID));
	3334	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
	3335	const Int iTransformShift = getTransformShift(channelBitDepth, rTu.GetEquivalentLog2TrSize(compID), maxLog2TrDynamicRange);
	3336	const Int scalingListType = getScalingListType(pcCU->getPredictionMode(uiAbsPartIdx), compID);
	3337	const Bool enableScalingLists = getUseScalingList(uiWidth, uiHeight, true);
	3338	const Int defaultQuantisationCoefficient = g_quantScales[cQP.rem];
	3339
	3340	assert( scalingListType < SCALING_LIST_NUM );
	3341	const Int *const piQuantCoeff = getQuantCoeff( scalingListType, cQP.rem, (rTu.GetEquivalentLog2TrSize(compID)-2) );
	3342
	3343
	3344	/* for 422 chroma blocks, the effective scaling applied during transformation is not a power of 2, hence it cannot be
	3345	* implemented as a bit-shift (the quantised result will be sqrt(2) * larger than required). Alternatively, adjust the
	3346	* uiLog2TrSize applied in iTransformShift, such that the result is 1/sqrt(2) the required result (i.e. smaller)
	3347	* Then a QP+3 (sqrt(2)) or QP-3 (1/sqrt(2)) method could be used to get the required result
	3348	*/
	3349
	3350	const Int iQBits = QUANT_SHIFT + cQP.per + iTransformShift;
	3351	// QBits will be OK for any internal bit depth as the reduction in transform shift is balanced by an increase in Qp_per due to QpBDOffset
	3352
	3353	const Int iAdd = ( bUseHalfRoundingPoint ? 256 : (pcCU->getSlice()->getSliceType() == I_SLICE ? 171 : 85) ) << (iQBits - 9);
	3354
	3355	TCoeff transformedCoefficient;
	3356
	3357	// transform-skip
	3358	if (iTransformShift >= 0)
	3359	{
	3360	transformedCoefficient = resiDiff << iTransformShift;
	3361	}
	3362	else // for very high bit depths
	3363	{
	3364	const Int iTrShiftNeg = -iTransformShift;
	3365	const Int offset = 1 << (iTrShiftNeg - 1);
	3366	transformedCoefficient = ( resiDiff + offset ) >> iTrShiftNeg;
	3367	}
	3368
	3369	// quantization
	3370	const TCoeff iSign = (transformedCoefficient < 0 ? -1: 1);
	3371
	3372	const Int quantisationCoefficient = enableScalingLists ? piQuantCoeff[uiPos] : defaultQuantisationCoefficient;
	3373
	3374	const Int64 tmpLevel = (Int64)abs(transformedCoefficient) * quantisationCoefficient;
	3375
	3376	const TCoeff quantisedCoefficient = (TCoeff((tmpLevel + iAdd ) >> iQBits)) * iSign;
	3377
	3378	const TCoeff entropyCodingMinimum = -(1 << maxLog2TrDynamicRange);
	3379	const TCoeff entropyCodingMaximum = (1 << maxLog2TrDynamicRange) - 1;
	3380	pcCoeff[ uiPos ] = Clip3<TCoeff>( entropyCodingMinimum, entropyCodingMaximum, quantisedCoefficient );
	3381	}
	3382
	3383
	3384	Void TComTrQuant::invTrSkipDeQuantOneSample( TComTU &rTu, ComponentID compID, TCoeff inSample, Pel &reconSample, const QpParam &cQP, UInt uiPos )
	3385	{
	3386	TComDataCU *pcCU = rTu.getCU();
	3387	const UInt uiAbsPartIdx = rTu.GetAbsPartIdxTU();
	3388	const TComRectangle &rect = rTu.getRect(compID);
	3389	const UInt uiWidth = rect.width;
	3390	const UInt uiHeight = rect.height;
	3391	const Int QP_per = cQP.per;
	3392	const Int QP_rem = cQP.rem;
	3393	const Int maxLog2TrDynamicRange = pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID));
	3394	#if O0043_BEST_EFFORT_DECODING
	3395	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getStreamBitDepth(toChannelType(compID));
	3396	#else
	3397	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
	3398	#endif
	3399	const Int iTransformShift = getTransformShift(channelBitDepth, rTu.GetEquivalentLog2TrSize(compID), maxLog2TrDynamicRange);
	3400	const Int scalingListType = getScalingListType(pcCU->getPredictionMode(uiAbsPartIdx), compID);
	3401	const Bool enableScalingLists = getUseScalingList(uiWidth, uiHeight, true);
	3402	const UInt uiLog2TrSize = rTu.GetEquivalentLog2TrSize(compID);
	3403
	3404	assert( scalingListType < SCALING_LIST_NUM );
	3405
	3406	const Int rightShift = (IQUANT_SHIFT - (iTransformShift + QP_per)) + (enableScalingLists ? LOG2_SCALING_LIST_NEUTRAL_VALUE : 0);
	3407
	3408	const TCoeff transformMinimum = -(1 << maxLog2TrDynamicRange);
	3409	const TCoeff transformMaximum = (1 << maxLog2TrDynamicRange) - 1;
	3410
	3411	// Dequantisation
	3412
	3413	TCoeff dequantisedSample;
	3414
	3415	if(enableScalingLists)
	3416	{
	3417	const UInt dequantCoefBits = 1 + IQUANT_SHIFT + SCALING_LIST_BITS;
	3418	const UInt targetInputBitDepth = std::min<UInt>((maxLog2TrDynamicRange + 1), (((sizeof(Intermediate_Int) * 8) + rightShift) - dequantCoefBits));
	3419
	3420	const Intermediate_Int inputMinimum = -(1 << (targetInputBitDepth - 1));
	3421	const Intermediate_Int inputMaximum = (1 << (targetInputBitDepth - 1)) - 1;
	3422
	3423	Int *piDequantCoef = getDequantCoeff(scalingListType,QP_rem,uiLog2TrSize-2);
	3424
	3425	if(rightShift > 0)
	3426	{
	3427	const Intermediate_Int iAdd = 1 << (rightShift - 1);
	3428	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, inSample));
	3429	const Intermediate_Int iCoeffQ = ((Intermediate_Int(clipQCoef) * piDequantCoef[uiPos]) + iAdd ) >> rightShift;
	3430
	3431	dequantisedSample = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
	3432	}
	3433	else
	3434	{
	3435	const Int leftShift = -rightShift;
	3436	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, inSample));
	3437	const Intermediate_Int iCoeffQ = (Intermediate_Int(clipQCoef) * piDequantCoef[uiPos]) << leftShift;
	3438
	3439	dequantisedSample = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
	3440	}
	3441	}
	3442	else
	3443	{
	3444	const Int scale = g_invQuantScales[QP_rem];
	3445	const Int scaleBits = (IQUANT_SHIFT + 1) ;
	3446
	3447	const UInt targetInputBitDepth = std::min<UInt>((maxLog2TrDynamicRange + 1), (((sizeof(Intermediate_Int) * 8) + rightShift) - scaleBits));
	3448	const Intermediate_Int inputMinimum = -(1 << (targetInputBitDepth - 1));
	3449	const Intermediate_Int inputMaximum = (1 << (targetInputBitDepth - 1)) - 1;
	3450
	3451	if (rightShift > 0)
	3452	{
	3453	const Intermediate_Int iAdd = 1 << (rightShift - 1);
	3454	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, inSample));
	3455	const Intermediate_Int iCoeffQ = (Intermediate_Int(clipQCoef) * scale + iAdd) >> rightShift;
	3456
	3457	dequantisedSample = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
	3458	}
	3459	else
	3460	{
	3461	const Int leftShift = -rightShift;
	3462	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, inSample));
	3463	const Intermediate_Int iCoeffQ = (Intermediate_Int(clipQCoef) * scale) << leftShift;
	3464
	3465	dequantisedSample = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
	3466	}
	3467	}
	3468
	3469	// Inverse transform-skip
	3470
	3471	if (iTransformShift >= 0)
	3472	{
	3473	const TCoeff offset = iTransformShift==0 ? 0 : (1 << (iTransformShift - 1));
	3474	reconSample = Pel(( dequantisedSample + offset ) >> iTransformShift);
	3475	}
	3476	else //for very high bit depths
	3477	{
	3478	const Int iTrShiftNeg = -iTransformShift;
	3479	reconSample = Pel(dequantisedSample << iTrShiftNeg);
	3480	}
	3481	}
	3482
	3483
	3484	Void TComTrQuant::crossComponentPrediction( TComTU & rTu,
	3485	const ComponentID compID,
	3486	const Pel * piResiL,
	3487	const Pel * piResiC,
	3488	Pel * piResiT,
	3489	const Int width,
	3490	const Int height,
	3491	const Int strideL,
	3492	const Int strideC,
	3493	const Int strideT,
	3494	const Bool reverse )
	3495	{
	3496	const Pel *pResiL = piResiL;
	3497	const Pel *pResiC = piResiC;
	3498	Pel *pResiT = piResiT;
	3499
	3500	TComDataCU *pCU = rTu.getCU();
	3501	const Int alpha = pCU->getCrossComponentPredictionAlpha( rTu.GetAbsPartIdxTU( compID ), compID );
	3502	const Int diffBitDepth = pCU->getSlice()->getSPS()->getDifferentialLumaChromaBitDepth();
	3503
	3504	for( Int y = 0; y < height; y++ )
	3505	{
	3506	if (reverse)
	3507	{
	3508	// A constraint is to be added to the HEVC Standard to limit the size of pResiL and pResiC at this point.
	3509	// The likely form of the constraint is to either restrict the values to CoeffMin to CoeffMax,
	3510	// or to be representable in a bitDepthY+4 or bitDepthC+4 signed integer.
	3511	// The result of the constraint is that for 8/10/12bit profiles, the input values
	3512	// can be represented within a 16-bit Pel-type.
	3513	#if RExt__HIGH_BIT_DEPTH_SUPPORT
	3514	for( Int x = 0; x < width; x++ )
	3515	{
	3516	pResiT[x] = pResiC[x] + (( alpha * rightShift( pResiL[x], diffBitDepth) ) >> 3);
	3517	}
	3518	#else
	3519	const Int minPel=std::numeric_limits<Pel>::min();
	3520	const Int maxPel=std::numeric_limits<Pel>::max();
	3521	for( Int x = 0; x < width; x++ )
	3522	{
	3523	pResiT[x] = Clip3<Int>(minPel, maxPel, pResiC[x] + (( alpha * rightShift<Int>(Int(pResiL[x]), diffBitDepth) ) >> 3));
	3524	}
	3525	#endif
	3526	}
	3527	else
	3528	{
	3529	// Forward does not need clipping. Pel type should always be big enough.
	3530	for( Int x = 0; x < width; x++ )
	3531	{
	3532	pResiT[x] = pResiC[x] - (( alpha * rightShift<Int>(Int(pResiL[x]), diffBitDepth) ) >> 3);
	3533	}
	3534	}
	3535
	3536	pResiL += strideL;
	3537	pResiC += strideC;
	3538	pResiT += strideT;
	3539	}
	3540	}
	3541
[56]	3542	//! \}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: