Context navigation

source: 3DVCSoftware/trunk/source/Lib/TLibCommon/TComTrQuant.cpp @ 1313

Visit:

Last change on this file since 1313 was 1313, checked in by tech, 9 years ago
Merged 14.1-update-dev1@1312.
Property svn:eol-style set to `native`
File size: 130.4 KB

Line
1	/* The copyright in this software is being made available under the BSD
2	* License, included below. This software may be subject to other third party
3	* and contributor rights, including patent rights, and no such rights are
4	* granted under this license.
5	*
6	* Copyright (c) 2010-2015, ITU/ISO/IEC
7	* All rights reserved.
8	*
9	* Redistribution and use in source and binary forms, with or without
10	* modification, are permitted provided that the following conditions are met:
11	*
12	* * Redistributions of source code must retain the above copyright notice,
13	* this list of conditions and the following disclaimer.
14	* * Redistributions in binary form must reproduce the above copyright notice,
15	* this list of conditions and the following disclaimer in the documentation
16	* and/or other materials provided with the distribution.
17	* * Neither the name of the ITU/ISO/IEC nor the names of its contributors may
18	* be used to endorse or promote products derived from this software without
19	* specific prior written permission.
20	*
21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24	* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
25	* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
31	* THE POSSIBILITY OF SUCH DAMAGE.
32	*/
33
34	/** \file TComTrQuant.cpp
35	\brief transform and quantization class
36	*/
37
38	#include <stdlib.h>
39	#include <math.h>
40	#include <limits>
41	#include <memory.h>
42	#include "TComTrQuant.h"
43	#include "TComPic.h"
44	#include "ContextTables.h"
45	#include "TComTU.h"
46	#include "Debug.h"
47
48	typedef struct
49	{
50	Int iNNZbeforePos0;
51	Double d64CodedLevelandDist; // distortion and level cost only
52	Double d64UncodedDist; // all zero coded block distortion
53	Double d64SigCost;
54	Double d64SigCost_0;
55	} coeffGroupRDStats;
56
57	//! \ingroup TLibCommon
58	//! \{
59
60	// ====================================================================================================================
61	// Constants
62	// ====================================================================================================================
63
64	#define RDOQ_CHROMA 1 ///< use of RDOQ in chroma
65
66
67	// ====================================================================================================================
68	// QpParam constructor
69	// ====================================================================================================================
70
71	QpParam::QpParam(const Int qpy,
72	const ChannelType chType,
73	const Int qpBdOffset,
74	const Int chromaQPOffset,
75	const ChromaFormat chFmt )
76	{
77	Int baseQp;
78
79	if(isLuma(chType))
80	{
81	baseQp = qpy + qpBdOffset;
82	}
83	else
84	{
85	baseQp = Clip3( -qpBdOffset, (chromaQPMappingTableSize - 1), qpy + chromaQPOffset );
86
87	if(baseQp < 0)
88	{
89	baseQp = baseQp + qpBdOffset;
90	}
91	else
92	{
93	baseQp = getScaledChromaQP(baseQp, chFmt) + qpBdOffset;
94	}
95	}
96
97	Qp =baseQp;
98	per=baseQp/6;
99	rem=baseQp%6;
100	}
101
102	QpParam::QpParam(const TComDataCU &cu, const ComponentID compID)
103	{
104	Int chromaQpOffset = 0;
105
106	if (isChroma(compID))
107	{
108	chromaQpOffset += cu.getSlice()->getPPS()->getQpOffset(compID);
109	chromaQpOffset += cu.getSlice()->getSliceChromaQpDelta(compID);
110
111	chromaQpOffset += cu.getSlice()->getPPS()->getPpsRangeExtension().getChromaQpOffsetListEntry(cu.getChromaQpAdj(0)).u.offset[Int(compID)-1];
112	}
113
114	*this = QpParam(cu.getQP( 0 ),
115	toChannelType(compID),
116	cu.getSlice()->getSPS()->getQpBDOffset(toChannelType(compID)),
117	chromaQpOffset,
118	cu.getPic()->getChromaFormat());
119	}
120
121
122	// ====================================================================================================================
123	// TComTrQuant class member functions
124	// ====================================================================================================================
125
126	TComTrQuant::TComTrQuant()
127	{
128	// allocate temporary buffers
129	m_plTempCoeff = new TCoeff[ MAX_CU_SIZE*MAX_CU_SIZE ];
130
131	// allocate bit estimation class (for RDOQ)
132	m_pcEstBitsSbac = new estBitsSbacStruct;
133	initScalingList();
134	}
135
136	TComTrQuant::~TComTrQuant()
137	{
138	// delete temporary buffers
139	if ( m_plTempCoeff )
140	{
141	delete [] m_plTempCoeff;
142	m_plTempCoeff = NULL;
143	}
144
145	// delete bit estimation class
146	if ( m_pcEstBitsSbac )
147	{
148	delete m_pcEstBitsSbac;
149	}
150	destroyScalingList();
151	}
152
153	#if ADAPTIVE_QP_SELECTION
154	Void TComTrQuant::storeSliceQpNext(TComSlice* pcSlice)
155	{
156	// NOTE: does this work with negative QPs or when some blocks are transquant-bypass enabled?
157
158	Int qpBase = pcSlice->getSliceQpBase();
159	Int sliceQpused = pcSlice->getSliceQp();
160	Int sliceQpnext;
161	Double alpha = qpBase < 17 ? 0.5 : 1;
162
163	Int cnt=0;
164	for(Int u=1; u<=LEVEL_RANGE; u++)
165	{
166	cnt += m_sliceNsamples[u] ;
167	}
168
169	if( !m_useRDOQ )
170	{
171	sliceQpused = qpBase;
172	alpha = 0.5;
173	}
174
175	if( cnt > 120 )
176	{
177	Double sum = 0;
178	Int k = 0;
179	for(Int u=1; u<LEVEL_RANGE; u++)
180	{
181	sum += u*m_sliceSumC[u];
182	k += uum_sliceNsamples[u];
183	}
184
185	Int v;
186	Double q[MAX_QP+1] ;
187	for(v=0; v<=MAX_QP; v++)
188	{
189	q[v] = (Double)(g_invQuantScales[v%6] * (1<<(v/6)))/64 ;
190	}
191
192	Double qnext = sum/k * q[sliceQpused] / (1<<ARL_C_PRECISION);
193
194	for(v=0; v<MAX_QP; v++)
195	{
196	if(qnext < alpha * q[v] + (1 - alpha) * q[v+1] )
197	{
198	break;
199	}
200	}
201	sliceQpnext = Clip3(sliceQpused - 3, sliceQpused + 3, v);
202	}
203	else
204	{
205	sliceQpnext = sliceQpused;
206	}
207
208	m_qpDelta[qpBase] = sliceQpnext - qpBase;
209	}
210
211	Void TComTrQuant::initSliceQpDelta()
212	{
213	for(Int qp=0; qp<=MAX_QP; qp++)
214	{
215	m_qpDelta[qp] = qp < 17 ? 0 : 1;
216	}
217	}
218
219	Void TComTrQuant::clearSliceARLCnt()
220	{
221	memset(m_sliceSumC, 0, sizeof(Double)*(LEVEL_RANGE+1));
222	memset(m_sliceNsamples, 0, sizeof(Int)*(LEVEL_RANGE+1));
223	}
224	#endif
225
226
227
228	#if MATRIX_MULT
229	/** NxN forward transform (2D) using brute force matrix multiplication (3 nested loops)
230	* \param block pointer to input data (residual)
231	* \param coeff pointer to output data (transform coefficients)
232	* \param uiStride stride of input data
233	* \param uiTrSize transform size (uiTrSize x uiTrSize)
234	* \param uiMode is Intra Prediction mode used in Mode-Dependent DCT/DST only
235	*/
236	Void xTr(Int bitDepth, Pel block, TCoeff coeff, UInt uiStride, UInt uiTrSize, Bool useDST, const Int maxLog2TrDynamicRange)
237	{
238	UInt i,j,k;
239	TCoeff iSum;
240	TCoeff tmp[MAX_TU_SIZE * MAX_TU_SIZE];
241	const TMatrixCoeff *iT;
242	UInt uiLog2TrSize = g_aucConvertToBit[ uiTrSize ] + 2;
243
244	if (uiTrSize==4)
245	{
246	iT = (useDST ? g_as_DST_MAT_4[TRANSFORM_FORWARD][0] : g_aiT4[TRANSFORM_FORWARD][0]);
247	}
248	else if (uiTrSize==8)
249	{
250	iT = g_aiT8[TRANSFORM_FORWARD][0];
251	}
252	else if (uiTrSize==16)
253	{
254	iT = g_aiT16[TRANSFORM_FORWARD][0];
255	}
256	else if (uiTrSize==32)
257	{
258	iT = g_aiT32[TRANSFORM_FORWARD][0];
259	}
260	else
261	{
262	assert(0);
263	}
264
265	const Int TRANSFORM_MATRIX_SHIFT = g_transformMatrixShift[TRANSFORM_FORWARD];
266
267	const Int shift_1st = (uiLog2TrSize + bitDepth + TRANSFORM_MATRIX_SHIFT) - maxLog2TrDynamicRange;
268	const Int shift_2nd = uiLog2TrSize + TRANSFORM_MATRIX_SHIFT;
269	const Int add_1st = (shift_1st>0) ? (1<<(shift_1st-1)) : 0;
270	const Int add_2nd = 1<<(shift_2nd-1);
271
272	/* Horizontal transform */
273
274	for (i=0; i<uiTrSize; i++)
275	{
276	for (j=0; j<uiTrSize; j++)
277	{
278	iSum = 0;
279	for (k=0; k<uiTrSize; k++)
280	{
281	iSum += iT[iuiTrSize+k]block[j*uiStride+k];
282	}
283	tmp[i*uiTrSize+j] = (iSum + add_1st)>>shift_1st;
284	}
285	}
286
287	/* Vertical transform */
288	for (i=0; i<uiTrSize; i++)
289	{
290	for (j=0; j<uiTrSize; j++)
291	{
292	iSum = 0;
293	for (k=0; k<uiTrSize; k++)
294	{
295	iSum += iT[iuiTrSize+k]tmp[j*uiTrSize+k];
296	}
297	coeff[i*uiTrSize+j] = (iSum + add_2nd)>>shift_2nd;
298	}
299	}
300	}
301
302	/** NxN inverse transform (2D) using brute force matrix multiplication (3 nested loops)
303	* \param coeff pointer to input data (transform coefficients)
304	* \param block pointer to output data (residual)
305	* \param uiStride stride of output data
306	* \param uiTrSize transform size (uiTrSize x uiTrSize)
307	* \param uiMode is Intra Prediction mode used in Mode-Dependent DCT/DST only
308	*/
309	Void xITr(Int bitDepth, TCoeff coeff, Pel block, UInt uiStride, UInt uiTrSize, Bool useDST, const Int maxLog2TrDynamicRange)
310	{
311	UInt i,j,k;
312	TCoeff iSum;
313	TCoeff tmp[MAX_TU_SIZE * MAX_TU_SIZE];
314	const TMatrixCoeff *iT;
315
316	if (uiTrSize==4)
317	{
318	iT = (useDST ? g_as_DST_MAT_4[TRANSFORM_INVERSE][0] : g_aiT4[TRANSFORM_INVERSE][0]);
319	}
320	else if (uiTrSize==8)
321	{
322	iT = g_aiT8[TRANSFORM_INVERSE][0];
323	}
324	else if (uiTrSize==16)
325	{
326	iT = g_aiT16[TRANSFORM_INVERSE][0];
327	}
328	else if (uiTrSize==32)
329	{
330	iT = g_aiT32[TRANSFORM_INVERSE][0];
331	}
332	else
333	{
334	assert(0);
335	}
336
337	const Int TRANSFORM_MATRIX_SHIFT = g_transformMatrixShift[TRANSFORM_INVERSE];
338
339	const Int shift_1st = TRANSFORM_MATRIX_SHIFT + 1; //1 has been added to shift_1st at the expense of shift_2nd
340	const Int shift_2nd = (TRANSFORM_MATRIX_SHIFT + maxLog2TrDynamicRange - 1) - bitDepth;
341	const TCoeff clipMinimum = -(1 << maxLog2TrDynamicRange);
342	const TCoeff clipMaximum = (1 << maxLog2TrDynamicRange) - 1;
343	assert(shift_2nd>=0);
344	const Int add_1st = 1<<(shift_1st-1);
345	const Int add_2nd = (shift_2nd>0) ? (1<<(shift_2nd-1)) : 0;
346
347	/* Horizontal transform */
348	for (i=0; i<uiTrSize; i++)
349	{
350	for (j=0; j<uiTrSize; j++)
351	{
352	iSum = 0;
353	for (k=0; k<uiTrSize; k++)
354	{
355	iSum += iT[kuiTrSize+i]coeff[k*uiTrSize+j];
356	}
357
358	// Clipping here is not in the standard, but is used to protect the "Pel" data type into which the inverse-transformed samples will be copied
359	tmp[i*uiTrSize+j] = Clip3<TCoeff>(clipMinimum, clipMaximum, (iSum + add_1st)>>shift_1st);
360	}
361	}
362
363	/* Vertical transform */
364	for (i=0; i<uiTrSize; i++)
365	{
366	for (j=0; j<uiTrSize; j++)
367	{
368	iSum = 0;
369	for (k=0; k<uiTrSize; k++)
370	{
371	iSum += iT[kuiTrSize+j]tmp[i*uiTrSize+k];
372	}
373
374	block[i*uiStride+j] = Clip3<TCoeff>(std::numeric_limits<Pel>::min(), std::numeric_limits<Pel>::max(), (iSum + add_2nd)>>shift_2nd);
375	}
376	}
377	}
378
379	#endif //MATRIX_MULT
380
381
382	/** 4x4 forward transform implemented using partial butterfly structure (1D)
383	* \param src input data (residual)
384	* \param dst output data (transform coefficients)
385	* \param shift specifies right shift after 1D transform
386	* \param line
387	*/
388	Void partialButterfly4(TCoeff src, TCoeff dst, Int shift, Int line)
389	{
390	Int j;
391	TCoeff E[2],O[2];
392	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
393
394	for (j=0; j<line; j++)
395	{
396	/* E and O */
397	E[0] = src[0] + src[3];
398	O[0] = src[0] - src[3];
399	E[1] = src[1] + src[2];
400	O[1] = src[1] - src[2];
401
402	dst[0] = (g_aiT4[TRANSFORM_FORWARD][0][0]E[0] + g_aiT4[TRANSFORM_FORWARD][0][1]E[1] + add)>>shift;
403	dst[2line] = (g_aiT4[TRANSFORM_FORWARD][2][0]E[0] + g_aiT4[TRANSFORM_FORWARD][2][1]*E[1] + add)>>shift;
404	dst[line] = (g_aiT4[TRANSFORM_FORWARD][1][0]O[0] + g_aiT4[TRANSFORM_FORWARD][1][1]O[1] + add)>>shift;
405	dst[3line] = (g_aiT4[TRANSFORM_FORWARD][3][0]O[0] + g_aiT4[TRANSFORM_FORWARD][3][1]*O[1] + add)>>shift;
406
407	src += 4;
408	dst ++;
409	}
410	}
411
412	// Fast DST Algorithm. Full matrix multiplication for DST and Fast DST algorithm
413	// give identical results
414	Void fastForwardDst(TCoeff block, TCoeff coeff, Int shift) // input block, output coeff
415	{
416	Int i;
417	TCoeff c[4];
418	TCoeff rnd_factor = (shift > 0) ? (1<<(shift-1)) : 0;
419	for (i=0; i<4; i++)
420	{
421	// Intermediate Variables
422	c[0] = block[4*i+0];
423	c[1] = block[4*i+1];
424	c[2] = block[4*i+2];
425	c[3] = block[4*i+3];
426
427	for (Int row = 0; row < 4; row++)
428	{
429	TCoeff result = 0;
430	for (Int column = 0; column < 4; column++)
431	{
432	result += c[column] * g_as_DST_MAT_4[TRANSFORM_FORWARD][row][column]; // use the defined matrix, rather than hard-wired numbers
433	}
434
435	coeff[(row * 4) + i] = rightShift((result + rnd_factor), shift);
436	}
437	}
438	}
439
440	Void fastInverseDst(TCoeff tmp, TCoeff block, Int shift, const TCoeff outputMinimum, const TCoeff outputMaximum) // input tmp, output block
441	{
442	Int i;
443	TCoeff c[4];
444	TCoeff rnd_factor = (shift > 0) ? (1<<(shift-1)) : 0;
445	for (i=0; i<4; i++)
446	{
447	// Intermediate Variables
448	c[0] = tmp[ i];
449	c[1] = tmp[4 +i];
450	c[2] = tmp[8 +i];
451	c[3] = tmp[12+i];
452
453	for (Int column = 0; column < 4; column++)
454	{
455	TCoeff &result = block[(i * 4) + column];
456
457	result = 0;
458	for (Int row = 0; row < 4; row++)
459	{
460	result += c[row] * g_as_DST_MAT_4[TRANSFORM_INVERSE][row][column]; // use the defined matrix, rather than hard-wired numbers
461	}
462
463	result = Clip3( outputMinimum, outputMaximum, rightShift((result + rnd_factor), shift));
464	}
465	}
466	}
467
468	/** 4x4 inverse transform implemented using partial butterfly structure (1D)
469	* \param src input data (transform coefficients)
470	* \param dst output data (residual)
471	* \param shift specifies right shift after 1D transform
472	* \param line
473	* \param outputMinimum minimum for clipping
474	* \param outputMaximum maximum for clipping
475	*/
476	Void partialButterflyInverse4(TCoeff src, TCoeff dst, Int shift, Int line, const TCoeff outputMinimum, const TCoeff outputMaximum)
477	{
478	Int j;
479	TCoeff E[2],O[2];
480	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
481
482	for (j=0; j<line; j++)
483	{
484	/* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
485	O[0] = g_aiT4[TRANSFORM_INVERSE][1][0]src[line] + g_aiT4[TRANSFORM_INVERSE][3][0]src[3*line];
486	O[1] = g_aiT4[TRANSFORM_INVERSE][1][1]src[line] + g_aiT4[TRANSFORM_INVERSE][3][1]src[3*line];
487	E[0] = g_aiT4[TRANSFORM_INVERSE][0][0]src[0] + g_aiT4[TRANSFORM_INVERSE][2][0]src[2*line];
488	E[1] = g_aiT4[TRANSFORM_INVERSE][0][1]src[0] + g_aiT4[TRANSFORM_INVERSE][2][1]src[2*line];
489
490	/* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
491	dst[0] = Clip3( outputMinimum, outputMaximum, (E[0] + O[0] + add)>>shift );
492	dst[1] = Clip3( outputMinimum, outputMaximum, (E[1] + O[1] + add)>>shift );
493	dst[2] = Clip3( outputMinimum, outputMaximum, (E[1] - O[1] + add)>>shift );
494	dst[3] = Clip3( outputMinimum, outputMaximum, (E[0] - O[0] + add)>>shift );
495
496	src ++;
497	dst += 4;
498	}
499	}
500
501	/** 8x8 forward transform implemented using partial butterfly structure (1D)
502	* \param src input data (residual)
503	* \param dst output data (transform coefficients)
504	* \param shift specifies right shift after 1D transform
505	* \param line
506	*/
507	Void partialButterfly8(TCoeff src, TCoeff dst, Int shift, Int line)
508	{
509	Int j,k;
510	TCoeff E[4],O[4];
511	TCoeff EE[2],EO[2];
512	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
513
514	for (j=0; j<line; j++)
515	{
516	/* E and O*/
517	for (k=0;k<4;k++)
518	{
519	E[k] = src[k] + src[7-k];
520	O[k] = src[k] - src[7-k];
521	}
522	/* EE and EO */
523	EE[0] = E[0] + E[3];
524	EO[0] = E[0] - E[3];
525	EE[1] = E[1] + E[2];
526	EO[1] = E[1] - E[2];
527
528	dst[0] = (g_aiT8[TRANSFORM_FORWARD][0][0]EE[0] + g_aiT8[TRANSFORM_FORWARD][0][1]EE[1] + add)>>shift;
529	dst[4line] = (g_aiT8[TRANSFORM_FORWARD][4][0]EE[0] + g_aiT8[TRANSFORM_FORWARD][4][1]*EE[1] + add)>>shift;
530	dst[2line] = (g_aiT8[TRANSFORM_FORWARD][2][0]EO[0] + g_aiT8[TRANSFORM_FORWARD][2][1]*EO[1] + add)>>shift;
531	dst[6line] = (g_aiT8[TRANSFORM_FORWARD][6][0]EO[0] + g_aiT8[TRANSFORM_FORWARD][6][1]*EO[1] + add)>>shift;
532
533	dst[line] = (g_aiT8[TRANSFORM_FORWARD][1][0]O[0] + g_aiT8[TRANSFORM_FORWARD][1][1]O[1] + g_aiT8[TRANSFORM_FORWARD][1][2]O[2] + g_aiT8[TRANSFORM_FORWARD][1][3]O[3] + add)>>shift;
534	dst[3line] = (g_aiT8[TRANSFORM_FORWARD][3][0]O[0] + g_aiT8[TRANSFORM_FORWARD][3][1]O[1] + g_aiT8[TRANSFORM_FORWARD][3][2]O[2] + g_aiT8[TRANSFORM_FORWARD][3][3]*O[3] + add)>>shift;
535	dst[5line] = (g_aiT8[TRANSFORM_FORWARD][5][0]O[0] + g_aiT8[TRANSFORM_FORWARD][5][1]O[1] + g_aiT8[TRANSFORM_FORWARD][5][2]O[2] + g_aiT8[TRANSFORM_FORWARD][5][3]*O[3] + add)>>shift;
536	dst[7line] = (g_aiT8[TRANSFORM_FORWARD][7][0]O[0] + g_aiT8[TRANSFORM_FORWARD][7][1]O[1] + g_aiT8[TRANSFORM_FORWARD][7][2]O[2] + g_aiT8[TRANSFORM_FORWARD][7][3]*O[3] + add)>>shift;
537
538	src += 8;
539	dst ++;
540	}
541	}
542
543	/** 8x8 inverse transform implemented using partial butterfly structure (1D)
544	* \param src input data (transform coefficients)
545	* \param dst output data (residual)
546	* \param shift specifies right shift after 1D transform
547	* \param line
548	* \param outputMinimum minimum for clipping
549	* \param outputMaximum maximum for clipping
550	*/
551	Void partialButterflyInverse8(TCoeff src, TCoeff dst, Int shift, Int line, const TCoeff outputMinimum, const TCoeff outputMaximum)
552	{
553	Int j,k;
554	TCoeff E[4],O[4];
555	TCoeff EE[2],EO[2];
556	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
557
558	for (j=0; j<line; j++)
559	{
560	/* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
561	for (k=0;k<4;k++)
562	{
563	O[k] = g_aiT8[TRANSFORM_INVERSE][ 1][k]src[line] + g_aiT8[TRANSFORM_INVERSE][ 3][k]src[3*line] +
564	g_aiT8[TRANSFORM_INVERSE][ 5][k]src[5line] + g_aiT8[TRANSFORM_INVERSE][ 7][k]src[7line];
565	}
566
567	EO[0] = g_aiT8[TRANSFORM_INVERSE][2][0]src[ 2line ] + g_aiT8[TRANSFORM_INVERSE][6][0]src[ 6line ];
568	EO[1] = g_aiT8[TRANSFORM_INVERSE][2][1]src[ 2line ] + g_aiT8[TRANSFORM_INVERSE][6][1]src[ 6line ];
569	EE[0] = g_aiT8[TRANSFORM_INVERSE][0][0]src[ 0 ] + g_aiT8[TRANSFORM_INVERSE][4][0]src[ 4*line ];
570	EE[1] = g_aiT8[TRANSFORM_INVERSE][0][1]src[ 0 ] + g_aiT8[TRANSFORM_INVERSE][4][1]src[ 4*line ];
571
572	/* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
573	E[0] = EE[0] + EO[0];
574	E[3] = EE[0] - EO[0];
575	E[1] = EE[1] + EO[1];
576	E[2] = EE[1] - EO[1];
577	for (k=0;k<4;k++)
578	{
579	dst[ k ] = Clip3( outputMinimum, outputMaximum, (E[k] + O[k] + add)>>shift );
580	dst[ k+4 ] = Clip3( outputMinimum, outputMaximum, (E[3-k] - O[3-k] + add)>>shift );
581	}
582	src ++;
583	dst += 8;
584	}
585	}
586
587	/** 16x16 forward transform implemented using partial butterfly structure (1D)
588	* \param src input data (residual)
589	* \param dst output data (transform coefficients)
590	* \param shift specifies right shift after 1D transform
591	* \param line
592	*/
593	Void partialButterfly16(TCoeff src, TCoeff dst, Int shift, Int line)
594	{
595	Int j,k;
596	TCoeff E[8],O[8];
597	TCoeff EE[4],EO[4];
598	TCoeff EEE[2],EEO[2];
599	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
600
601	for (j=0; j<line; j++)
602	{
603	/* E and O*/
604	for (k=0;k<8;k++)
605	{
606	E[k] = src[k] + src[15-k];
607	O[k] = src[k] - src[15-k];
608	}
609	/* EE and EO */
610	for (k=0;k<4;k++)
611	{
612	EE[k] = E[k] + E[7-k];
613	EO[k] = E[k] - E[7-k];
614	}
615	/* EEE and EEO */
616	EEE[0] = EE[0] + EE[3];
617	EEO[0] = EE[0] - EE[3];
618	EEE[1] = EE[1] + EE[2];
619	EEO[1] = EE[1] - EE[2];
620
621	dst[ 0 ] = (g_aiT16[TRANSFORM_FORWARD][ 0][0]EEE[0] + g_aiT16[TRANSFORM_FORWARD][ 0][1]EEE[1] + add)>>shift;
622	dst[ 8line ] = (g_aiT16[TRANSFORM_FORWARD][ 8][0]EEE[0] + g_aiT16[TRANSFORM_FORWARD][ 8][1]*EEE[1] + add)>>shift;
623	dst[ 4line ] = (g_aiT16[TRANSFORM_FORWARD][ 4][0]EEO[0] + g_aiT16[TRANSFORM_FORWARD][ 4][1]*EEO[1] + add)>>shift;
624	dst[ 12line] = (g_aiT16[TRANSFORM_FORWARD][12][0]EEO[0] + g_aiT16[TRANSFORM_FORWARD][12][1]*EEO[1] + add)>>shift;
625
626	for (k=2;k<16;k+=4)
627	{
628	dst[ kline ] = (g_aiT16[TRANSFORM_FORWARD][k][0]EO[0] + g_aiT16[TRANSFORM_FORWARD][k][1]*EO[1] +
629	g_aiT16[TRANSFORM_FORWARD][k][2]EO[2] + g_aiT16[TRANSFORM_FORWARD][k][3]EO[3] + add)>>shift;
630	}
631
632	for (k=1;k<16;k+=2)
633	{
634	dst[ kline ] = (g_aiT16[TRANSFORM_FORWARD][k][0]O[0] + g_aiT16[TRANSFORM_FORWARD][k][1]*O[1] +
635	g_aiT16[TRANSFORM_FORWARD][k][2]O[2] + g_aiT16[TRANSFORM_FORWARD][k][3]O[3] +
636	g_aiT16[TRANSFORM_FORWARD][k][4]O[4] + g_aiT16[TRANSFORM_FORWARD][k][5]O[5] +
637	g_aiT16[TRANSFORM_FORWARD][k][6]O[6] + g_aiT16[TRANSFORM_FORWARD][k][7]O[7] + add)>>shift;
638	}
639
640	src += 16;
641	dst ++;
642
643	}
644	}
645
646	/** 16x16 inverse transform implemented using partial butterfly structure (1D)
647	* \param src input data (transform coefficients)
648	* \param dst output data (residual)
649	* \param shift specifies right shift after 1D transform
650	* \param line
651	* \param outputMinimum minimum for clipping
652	* \param outputMaximum maximum for clipping
653	*/
654	Void partialButterflyInverse16(TCoeff src, TCoeff dst, Int shift, Int line, const TCoeff outputMinimum, const TCoeff outputMaximum)
655	{
656	Int j,k;
657	TCoeff E[8],O[8];
658	TCoeff EE[4],EO[4];
659	TCoeff EEE[2],EEO[2];
660	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
661
662	for (j=0; j<line; j++)
663	{
664	/* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
665	for (k=0;k<8;k++)
666	{
667	O[k] = g_aiT16[TRANSFORM_INVERSE][ 1][k]src[ line] + g_aiT16[TRANSFORM_INVERSE][ 3][k]src[ 3*line] +
668	g_aiT16[TRANSFORM_INVERSE][ 5][k]src[ 5line] + g_aiT16[TRANSFORM_INVERSE][ 7][k]src[ 7line] +
669	g_aiT16[TRANSFORM_INVERSE][ 9][k]src[ 9line] + g_aiT16[TRANSFORM_INVERSE][11][k]src[11line] +
670	g_aiT16[TRANSFORM_INVERSE][13][k]src[13line] + g_aiT16[TRANSFORM_INVERSE][15][k]src[15line];
671	}
672	for (k=0;k<4;k++)
673	{
674	EO[k] = g_aiT16[TRANSFORM_INVERSE][ 2][k]src[ 2line] + g_aiT16[TRANSFORM_INVERSE][ 6][k]src[ 6line] +
675	g_aiT16[TRANSFORM_INVERSE][10][k]src[10line] + g_aiT16[TRANSFORM_INVERSE][14][k]src[14line];
676	}
677	EEO[0] = g_aiT16[TRANSFORM_INVERSE][4][0]src[ 4line ] + g_aiT16[TRANSFORM_INVERSE][12][0]src[ 12line ];
678	EEE[0] = g_aiT16[TRANSFORM_INVERSE][0][0]src[ 0 ] + g_aiT16[TRANSFORM_INVERSE][ 8][0]src[ 8*line ];
679	EEO[1] = g_aiT16[TRANSFORM_INVERSE][4][1]src[ 4line ] + g_aiT16[TRANSFORM_INVERSE][12][1]src[ 12line ];
680	EEE[1] = g_aiT16[TRANSFORM_INVERSE][0][1]src[ 0 ] + g_aiT16[TRANSFORM_INVERSE][ 8][1]src[ 8*line ];
681
682	/* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
683	for (k=0;k<2;k++)
684	{
685	EE[k] = EEE[k] + EEO[k];
686	EE[k+2] = EEE[1-k] - EEO[1-k];
687	}
688	for (k=0;k<4;k++)
689	{
690	E[k] = EE[k] + EO[k];
691	E[k+4] = EE[3-k] - EO[3-k];
692	}
693	for (k=0;k<8;k++)
694	{
695	dst[k] = Clip3( outputMinimum, outputMaximum, (E[k] + O[k] + add)>>shift );
696	dst[k+8] = Clip3( outputMinimum, outputMaximum, (E[7-k] - O[7-k] + add)>>shift );
697	}
698	src ++;
699	dst += 16;
700	}
701	}
702
703	/** 32x32 forward transform implemented using partial butterfly structure (1D)
704	* \param src input data (residual)
705	* \param dst output data (transform coefficients)
706	* \param shift specifies right shift after 1D transform
707	* \param line
708	*/
709	Void partialButterfly32(TCoeff src, TCoeff dst, Int shift, Int line)
710	{
711	Int j,k;
712	TCoeff E[16],O[16];
713	TCoeff EE[8],EO[8];
714	TCoeff EEE[4],EEO[4];
715	TCoeff EEEE[2],EEEO[2];
716	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
717
718	for (j=0; j<line; j++)
719	{
720	/* E and O*/
721	for (k=0;k<16;k++)
722	{
723	E[k] = src[k] + src[31-k];
724	O[k] = src[k] - src[31-k];
725	}
726	/* EE and EO */
727	for (k=0;k<8;k++)
728	{
729	EE[k] = E[k] + E[15-k];
730	EO[k] = E[k] - E[15-k];
731	}
732	/* EEE and EEO */
733	for (k=0;k<4;k++)
734	{
735	EEE[k] = EE[k] + EE[7-k];
736	EEO[k] = EE[k] - EE[7-k];
737	}
738	/* EEEE and EEEO */
739	EEEE[0] = EEE[0] + EEE[3];
740	EEEO[0] = EEE[0] - EEE[3];
741	EEEE[1] = EEE[1] + EEE[2];
742	EEEO[1] = EEE[1] - EEE[2];
743
744	dst[ 0 ] = (g_aiT32[TRANSFORM_FORWARD][ 0][0]EEEE[0] + g_aiT32[TRANSFORM_FORWARD][ 0][1]EEEE[1] + add)>>shift;
745	dst[ 16line ] = (g_aiT32[TRANSFORM_FORWARD][16][0]EEEE[0] + g_aiT32[TRANSFORM_FORWARD][16][1]*EEEE[1] + add)>>shift;
746	dst[ 8line ] = (g_aiT32[TRANSFORM_FORWARD][ 8][0]EEEO[0] + g_aiT32[TRANSFORM_FORWARD][ 8][1]*EEEO[1] + add)>>shift;
747	dst[ 24line ] = (g_aiT32[TRANSFORM_FORWARD][24][0]EEEO[0] + g_aiT32[TRANSFORM_FORWARD][24][1]*EEEO[1] + add)>>shift;
748	for (k=4;k<32;k+=8)
749	{
750	dst[ kline ] = (g_aiT32[TRANSFORM_FORWARD][k][0]EEO[0] + g_aiT32[TRANSFORM_FORWARD][k][1]*EEO[1] +
751	g_aiT32[TRANSFORM_FORWARD][k][2]EEO[2] + g_aiT32[TRANSFORM_FORWARD][k][3]EEO[3] + add)>>shift;
752	}
753	for (k=2;k<32;k+=4)
754	{
755	dst[ kline ] = (g_aiT32[TRANSFORM_FORWARD][k][0]EO[0] + g_aiT32[TRANSFORM_FORWARD][k][1]*EO[1] +
756	g_aiT32[TRANSFORM_FORWARD][k][2]EO[2] + g_aiT32[TRANSFORM_FORWARD][k][3]EO[3] +
757	g_aiT32[TRANSFORM_FORWARD][k][4]EO[4] + g_aiT32[TRANSFORM_FORWARD][k][5]EO[5] +
758	g_aiT32[TRANSFORM_FORWARD][k][6]EO[6] + g_aiT32[TRANSFORM_FORWARD][k][7]EO[7] + add)>>shift;
759	}
760	for (k=1;k<32;k+=2)
761	{
762	dst[ kline ] = (g_aiT32[TRANSFORM_FORWARD][k][ 0]O[ 0] + g_aiT32[TRANSFORM_FORWARD][k][ 1]*O[ 1] +
763	g_aiT32[TRANSFORM_FORWARD][k][ 2]O[ 2] + g_aiT32[TRANSFORM_FORWARD][k][ 3]O[ 3] +
764	g_aiT32[TRANSFORM_FORWARD][k][ 4]O[ 4] + g_aiT32[TRANSFORM_FORWARD][k][ 5]O[ 5] +
765	g_aiT32[TRANSFORM_FORWARD][k][ 6]O[ 6] + g_aiT32[TRANSFORM_FORWARD][k][ 7]O[ 7] +
766	g_aiT32[TRANSFORM_FORWARD][k][ 8]O[ 8] + g_aiT32[TRANSFORM_FORWARD][k][ 9]O[ 9] +
767	g_aiT32[TRANSFORM_FORWARD][k][10]O[10] + g_aiT32[TRANSFORM_FORWARD][k][11]O[11] +
768	g_aiT32[TRANSFORM_FORWARD][k][12]O[12] + g_aiT32[TRANSFORM_FORWARD][k][13]O[13] +
769	g_aiT32[TRANSFORM_FORWARD][k][14]O[14] + g_aiT32[TRANSFORM_FORWARD][k][15]O[15] + add)>>shift;
770	}
771
772	src += 32;
773	dst ++;
774	}
775	}
776
777	/** 32x32 inverse transform implemented using partial butterfly structure (1D)
778	* \param src input data (transform coefficients)
779	* \param dst output data (residual)
780	* \param shift specifies right shift after 1D transform
781	* \param line
782	* \param outputMinimum minimum for clipping
783	* \param outputMaximum maximum for clipping
784	*/
785	Void partialButterflyInverse32(TCoeff src, TCoeff dst, Int shift, Int line, const TCoeff outputMinimum, const TCoeff outputMaximum)
786	{
787	Int j,k;
788	TCoeff E[16],O[16];
789	TCoeff EE[8],EO[8];
790	TCoeff EEE[4],EEO[4];
791	TCoeff EEEE[2],EEEO[2];
792	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
793
794	for (j=0; j<line; j++)
795	{
796	/* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
797	for (k=0;k<16;k++)
798	{
799	O[k] = g_aiT32[TRANSFORM_INVERSE][ 1][k]src[ line ] + g_aiT32[TRANSFORM_INVERSE][ 3][k]src[ 3*line ] +
800	g_aiT32[TRANSFORM_INVERSE][ 5][k]src[ 5line ] + g_aiT32[TRANSFORM_INVERSE][ 7][k]src[ 7line ] +
801	g_aiT32[TRANSFORM_INVERSE][ 9][k]src[ 9line ] + g_aiT32[TRANSFORM_INVERSE][11][k]src[ 11line ] +
802	g_aiT32[TRANSFORM_INVERSE][13][k]src[ 13line ] + g_aiT32[TRANSFORM_INVERSE][15][k]src[ 15line ] +
803	g_aiT32[TRANSFORM_INVERSE][17][k]src[ 17line ] + g_aiT32[TRANSFORM_INVERSE][19][k]src[ 19line ] +
804	g_aiT32[TRANSFORM_INVERSE][21][k]src[ 21line ] + g_aiT32[TRANSFORM_INVERSE][23][k]src[ 23line ] +
805	g_aiT32[TRANSFORM_INVERSE][25][k]src[ 25line ] + g_aiT32[TRANSFORM_INVERSE][27][k]src[ 27line ] +
806	g_aiT32[TRANSFORM_INVERSE][29][k]src[ 29line ] + g_aiT32[TRANSFORM_INVERSE][31][k]src[ 31line ];
807	}
808	for (k=0;k<8;k++)
809	{
810	EO[k] = g_aiT32[TRANSFORM_INVERSE][ 2][k]src[ 2line ] + g_aiT32[TRANSFORM_INVERSE][ 6][k]src[ 6line ] +
811	g_aiT32[TRANSFORM_INVERSE][10][k]src[ 10line ] + g_aiT32[TRANSFORM_INVERSE][14][k]src[ 14line ] +
812	g_aiT32[TRANSFORM_INVERSE][18][k]src[ 18line ] + g_aiT32[TRANSFORM_INVERSE][22][k]src[ 22line ] +
813	g_aiT32[TRANSFORM_INVERSE][26][k]src[ 26line ] + g_aiT32[TRANSFORM_INVERSE][30][k]src[ 30line ];
814	}
815	for (k=0;k<4;k++)
816	{
817	EEO[k] = g_aiT32[TRANSFORM_INVERSE][ 4][k]src[ 4line ] + g_aiT32[TRANSFORM_INVERSE][12][k]src[ 12line ] +
818	g_aiT32[TRANSFORM_INVERSE][20][k]src[ 20line ] + g_aiT32[TRANSFORM_INVERSE][28][k]src[ 28line ];
819	}
820	EEEO[0] = g_aiT32[TRANSFORM_INVERSE][8][0]src[ 8line ] + g_aiT32[TRANSFORM_INVERSE][24][0]src[ 24line ];
821	EEEO[1] = g_aiT32[TRANSFORM_INVERSE][8][1]src[ 8line ] + g_aiT32[TRANSFORM_INVERSE][24][1]src[ 24line ];
822	EEEE[0] = g_aiT32[TRANSFORM_INVERSE][0][0]src[ 0 ] + g_aiT32[TRANSFORM_INVERSE][16][0]src[ 16*line ];
823	EEEE[1] = g_aiT32[TRANSFORM_INVERSE][0][1]src[ 0 ] + g_aiT32[TRANSFORM_INVERSE][16][1]src[ 16*line ];
824
825	/* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
826	EEE[0] = EEEE[0] + EEEO[0];
827	EEE[3] = EEEE[0] - EEEO[0];
828	EEE[1] = EEEE[1] + EEEO[1];
829	EEE[2] = EEEE[1] - EEEO[1];
830	for (k=0;k<4;k++)
831	{
832	EE[k] = EEE[k] + EEO[k];
833	EE[k+4] = EEE[3-k] - EEO[3-k];
834	}
835	for (k=0;k<8;k++)
836	{
837	E[k] = EE[k] + EO[k];
838	E[k+8] = EE[7-k] - EO[7-k];
839	}
840	for (k=0;k<16;k++)
841	{
842	dst[k] = Clip3( outputMinimum, outputMaximum, (E[k] + O[k] + add)>>shift );
843	dst[k+16] = Clip3( outputMinimum, outputMaximum, (E[15-k] - O[15-k] + add)>>shift );
844	}
845	src ++;
846	dst += 32;
847	}
848	}
849
850	/** MxN forward transform (2D)
851	* \param bitDepth [in] bit depth
852	* \param block [in] residual block
853	* \param coeff [out] transform coefficients
854	* \param iWidth [in] width of transform
855	* \param iHeight [in] height of transform
856	* \param useDST [in]
857	* \param maxLog2TrDynamicRange [in]
858
859	*/
860	Void xTrMxN(Int bitDepth, TCoeff block, TCoeff coeff, Int iWidth, Int iHeight, Bool useDST, const Int maxLog2TrDynamicRange)
861	{
862	const Int TRANSFORM_MATRIX_SHIFT = g_transformMatrixShift[TRANSFORM_FORWARD];
863
864	const Int shift_1st = ((g_aucConvertToBit[iWidth] + 2) + bitDepth + TRANSFORM_MATRIX_SHIFT) - maxLog2TrDynamicRange;
865	const Int shift_2nd = (g_aucConvertToBit[iHeight] + 2) + TRANSFORM_MATRIX_SHIFT;
866
867	assert(shift_1st >= 0);
868	assert(shift_2nd >= 0);
869
870	TCoeff tmp[ MAX_TU_SIZE * MAX_TU_SIZE ];
871
872	switch (iWidth)
873	{
874	case 4:
875	{
876	if ((iHeight == 4) && useDST) // Check for DCT or DST
877	{
878	fastForwardDst( block, tmp, shift_1st );
879	}
880	else
881	{
882	partialButterfly4 ( block, tmp, shift_1st, iHeight );
883	}
884	}
885	break;
886
887	case 8: partialButterfly8 ( block, tmp, shift_1st, iHeight ); break;
888	case 16: partialButterfly16( block, tmp, shift_1st, iHeight ); break;
889	case 32: partialButterfly32( block, tmp, shift_1st, iHeight ); break;
890	default:
891	assert(0); exit (1); break;
892	}
893
894	switch (iHeight)
895	{
896	case 4:
897	{
898	if ((iWidth == 4) && useDST) // Check for DCT or DST
899	{
900	fastForwardDst( tmp, coeff, shift_2nd );
901	}
902	else
903	{
904	partialButterfly4 ( tmp, coeff, shift_2nd, iWidth );
905	}
906	}
907	break;
908
909	case 8: partialButterfly8 ( tmp, coeff, shift_2nd, iWidth ); break;
910	case 16: partialButterfly16( tmp, coeff, shift_2nd, iWidth ); break;
911	case 32: partialButterfly32( tmp, coeff, shift_2nd, iWidth ); break;
912	default:
913	assert(0); exit (1); break;
914	}
915	}
916
917
918	/** MxN inverse transform (2D)
919	* \param bitDepth [in] bit depth
920	* \param coeff [in] transform coefficients
921	* \param block [out] residual block
922	* \param iWidth [in] width of transform
923	* \param iHeight [in] height of transform
924	* \param useDST [in]
925	* \param maxLog2TrDynamicRange [in]
926	*/
927	Void xITrMxN(Int bitDepth, TCoeff coeff, TCoeff block, Int iWidth, Int iHeight, Bool useDST, const Int maxLog2TrDynamicRange)
928	{
929	const Int TRANSFORM_MATRIX_SHIFT = g_transformMatrixShift[TRANSFORM_INVERSE];
930
931	Int shift_1st = TRANSFORM_MATRIX_SHIFT + 1; //1 has been added to shift_1st at the expense of shift_2nd
932	Int shift_2nd = (TRANSFORM_MATRIX_SHIFT + maxLog2TrDynamicRange - 1) - bitDepth;
933	const TCoeff clipMinimum = -(1 << maxLog2TrDynamicRange);
934	const TCoeff clipMaximum = (1 << maxLog2TrDynamicRange) - 1;
935
936	assert(shift_1st >= 0);
937	assert(shift_2nd >= 0);
938
939	TCoeff tmp[MAX_TU_SIZE * MAX_TU_SIZE];
940
941	switch (iHeight)
942	{
943	case 4:
944	{
945	if ((iWidth == 4) && useDST) // Check for DCT or DST
946	{
947	fastInverseDst( coeff, tmp, shift_1st, clipMinimum, clipMaximum);
948	}
949	else
950	{
951	partialButterflyInverse4 ( coeff, tmp, shift_1st, iWidth, clipMinimum, clipMaximum);
952	}
953	}
954	break;
955
956	case 8: partialButterflyInverse8 ( coeff, tmp, shift_1st, iWidth, clipMinimum, clipMaximum); break;
957	case 16: partialButterflyInverse16( coeff, tmp, shift_1st, iWidth, clipMinimum, clipMaximum); break;
958	case 32: partialButterflyInverse32( coeff, tmp, shift_1st, iWidth, clipMinimum, clipMaximum); break;
959
960	default:
961	assert(0); exit (1); break;
962	}
963
964	switch (iWidth)
965	{
966	// Clipping here is not in the standard, but is used to protect the "Pel" data type into which the inverse-transformed samples will be copied
967	case 4:
968	{
969	if ((iHeight == 4) && useDST) // Check for DCT or DST
970	{
971	fastInverseDst( tmp, block, shift_2nd, std::numeric_limits<Pel>::min(), std::numeric_limits<Pel>::max() );
972	}
973	else
974	{
975	partialButterflyInverse4 ( tmp, block, shift_2nd, iHeight, std::numeric_limits<Pel>::min(), std::numeric_limits<Pel>::max());
976	}
977	}
978	break;
979
980	case 8: partialButterflyInverse8 ( tmp, block, shift_2nd, iHeight, std::numeric_limits<Pel>::min(), std::numeric_limits<Pel>::max()); break;
981	case 16: partialButterflyInverse16( tmp, block, shift_2nd, iHeight, std::numeric_limits<Pel>::min(), std::numeric_limits<Pel>::max()); break;
982	case 32: partialButterflyInverse32( tmp, block, shift_2nd, iHeight, std::numeric_limits<Pel>::min(), std::numeric_limits<Pel>::max()); break;
983
984	default:
985	assert(0); exit (1); break;
986	}
987	}
988
989
990	// To minimize the distortion only. No rate is considered.
991	Void TComTrQuant::signBitHidingHDQ( TCoeff* pQCoef, TCoeff* pCoef, TCoeff* deltaU, const TUEntropyCodingParameters &codingParameters, const Int maxLog2TrDynamicRange )
992	{
993	const UInt width = codingParameters.widthInGroups << MLS_CG_LOG2_WIDTH;
994	const UInt height = codingParameters.heightInGroups << MLS_CG_LOG2_HEIGHT;
995	const UInt groupSize = 1 << MLS_CG_SIZE;
996
997	const TCoeff entropyCodingMinimum = -(1 << maxLog2TrDynamicRange);
998	const TCoeff entropyCodingMaximum = (1 << maxLog2TrDynamicRange) - 1;
999
1000	Int lastCG = -1;
1001	Int absSum = 0 ;
1002	Int n ;
1003
1004	for( Int subSet = (width*height-1) >> MLS_CG_SIZE; subSet >= 0; subSet-- )
1005	{
1006	Int subPos = subSet << MLS_CG_SIZE;
1007	Int firstNZPosInCG=groupSize , lastNZPosInCG=-1 ;
1008	absSum = 0 ;
1009
1010	for(n = groupSize-1; n >= 0; --n )
1011	{
1012	if( pQCoef[ codingParameters.scan[ n + subPos ]] )
1013	{
1014	lastNZPosInCG = n;
1015	break;
1016	}
1017	}
1018
1019	for(n = 0; n <groupSize; n++ )
1020	{
1021	if( pQCoef[ codingParameters.scan[ n + subPos ]] )
1022	{
1023	firstNZPosInCG = n;
1024	break;
1025	}
1026	}
1027
1028	for(n = firstNZPosInCG; n <=lastNZPosInCG; n++ )
1029	{
1030	absSum += Int(pQCoef[ codingParameters.scan[ n + subPos ]]);
1031	}
1032
1033	if(lastNZPosInCG>=0 && lastCG==-1)
1034	{
1035	lastCG = 1 ;
1036	}
1037
1038	if( lastNZPosInCG-firstNZPosInCG>=SBH_THRESHOLD )
1039	{
1040	UInt signbit = (pQCoef[codingParameters.scan[subPos+firstNZPosInCG]]>0?0:1) ;
1041	if( signbit!=(absSum&0x1) ) //compare signbit with sum_parity
1042	{
1043	TCoeff curCost = std::numeric_limits<TCoeff>::max();
1044	TCoeff minCostInc = std::numeric_limits<TCoeff>::max();
1045	Int minPos =-1, finalChange=0, curChange=0;
1046
1047	for( n = (lastCG==1?lastNZPosInCG:groupSize-1) ; n >= 0; --n )
1048	{
1049	UInt blkPos = codingParameters.scan[ n+subPos ];
1050	if(pQCoef[ blkPos ] != 0 )
1051	{
1052	if(deltaU[blkPos]>0)
1053	{
1054	curCost = - deltaU[blkPos];
1055	curChange=1 ;
1056	}
1057	else
1058	{
1059	//curChange =-1;
1060	if(n==firstNZPosInCG && abs(pQCoef[blkPos])==1)
1061	{
1062	curCost = std::numeric_limits<TCoeff>::max();
1063	}
1064	else
1065	{
1066	curCost = deltaU[blkPos];
1067	curChange =-1;
1068	}
1069	}
1070	}
1071	else
1072	{
1073	if(n<firstNZPosInCG)
1074	{
1075	UInt thisSignBit = (pCoef[blkPos]>=0?0:1);
1076	if(thisSignBit != signbit )
1077	{
1078	curCost = std::numeric_limits<TCoeff>::max();
1079	}
1080	else
1081	{
1082	curCost = - (deltaU[blkPos]) ;
1083	curChange = 1 ;
1084	}
1085	}
1086	else
1087	{
1088	curCost = - (deltaU[blkPos]) ;
1089	curChange = 1 ;
1090	}
1091	}
1092
1093	if( curCost<minCostInc)
1094	{
1095	minCostInc = curCost ;
1096	finalChange = curChange ;
1097	minPos = blkPos ;
1098	}
1099	} //CG loop
1100
1101	if(pQCoef[minPos] == entropyCodingMaximum \|\| pQCoef[minPos] == entropyCodingMinimum)
1102	{
1103	finalChange = -1;
1104	}
1105
1106	if(pCoef[minPos]>=0)
1107	{
1108	pQCoef[minPos] += finalChange ;
1109	}
1110	else
1111	{
1112	pQCoef[minPos] -= finalChange ;
1113	}
1114	} // Hide
1115	}
1116	if(lastCG==1)
1117	{
1118	lastCG=0 ;
1119	}
1120	} // TU loop
1121
1122	return;
1123	}
1124
1125
1126	Void TComTrQuant::xQuant( TComTU &rTu,
1127	TCoeff * pSrc,
1128	TCoeff * pDes,
1129	#if ADAPTIVE_QP_SELECTION
1130	TCoeff *pArlDes,
1131	#endif
1132	TCoeff &uiAbsSum,
1133	const ComponentID compID,
1134	const QpParam &cQP )
1135	{
1136	const TComRectangle &rect = rTu.getRect(compID);
1137	const UInt uiWidth = rect.width;
1138	const UInt uiHeight = rect.height;
1139	TComDataCU* pcCU = rTu.getCU();
1140	const UInt uiAbsPartIdx = rTu.GetAbsPartIdxTU();
1141	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
1142
1143	TCoeff* piCoef = pSrc;
1144	TCoeff* piQCoef = pDes;
1145	#if ADAPTIVE_QP_SELECTION
1146	TCoeff* piArlCCoef = pArlDes;
1147	#endif
1148
1149	const Bool useTransformSkip = pcCU->getTransformSkip(uiAbsPartIdx, compID);
1150	const Int maxLog2TrDynamicRange = pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID));
1151
1152	Bool useRDOQ = useTransformSkip ? m_useRDOQTS : m_useRDOQ;
1153	if ( useRDOQ && (isLuma(compID) \|\| RDOQ_CHROMA) )
1154	{
1155	#if T0196_SELECTIVE_RDOQ
1156	if ( !m_useSelectiveRDOQ \|\| xNeedRDOQ( rTu, piCoef, compID, cQP ) )
1157	{
1158	#endif
1159	#if ADAPTIVE_QP_SELECTION
1160	xRateDistOptQuant( rTu, piCoef, pDes, pArlDes, uiAbsSum, compID, cQP );
1161	#else
1162	xRateDistOptQuant( rTu, piCoef, pDes, uiAbsSum, compID, cQP );
1163	#endif
1164	#if T0196_SELECTIVE_RDOQ
1165	}
1166	else
1167	{
1168	memset( pDes, 0, sizeof( TCoeff ) * uiWidth *uiHeight );
1169	uiAbsSum = 0;
1170	}
1171	#endif
1172	}
1173	else
1174	{
1175	TUEntropyCodingParameters codingParameters;
1176	getTUEntropyCodingParameters(codingParameters, rTu, compID);
1177
1178	const TCoeff entropyCodingMinimum = -(1 << maxLog2TrDynamicRange);
1179	const TCoeff entropyCodingMaximum = (1 << maxLog2TrDynamicRange) - 1;
1180
1181	TCoeff deltaU[MAX_TU_SIZE * MAX_TU_SIZE];
1182
1183	const UInt uiLog2TrSize = rTu.GetEquivalentLog2TrSize(compID);
1184
1185	Int scalingListType = getScalingListType(pcCU->getPredictionMode(uiAbsPartIdx), compID);
1186	assert(scalingListType < SCALING_LIST_NUM);
1187	Int *piQuantCoeff = getQuantCoeff(scalingListType, cQP.rem, uiLog2TrSize-2);
1188
1189	const Bool enableScalingLists = getUseScalingList(uiWidth, uiHeight, (pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0));
1190	const Int defaultQuantisationCoefficient = g_quantScales[cQP.rem];
1191
1192	/* for 422 chroma blocks, the effective scaling applied during transformation is not a power of 2, hence it cannot be
1193	* implemented as a bit-shift (the quantised result will be sqrt(2) * larger than required). Alternatively, adjust the
1194	* uiLog2TrSize applied in iTransformShift, such that the result is 1/sqrt(2) the required result (i.e. smaller)
1195	* Then a QP+3 (sqrt(2)) or QP-3 (1/sqrt(2)) method could be used to get the required result
1196	*/
1197
1198	// Represents scaling through forward transform
1199	Int iTransformShift = getTransformShift(channelBitDepth, uiLog2TrSize, maxLog2TrDynamicRange);
1200	if (useTransformSkip && pcCU->getSlice()->getSPS()->getSpsRangeExtension().getExtendedPrecisionProcessingFlag())
1201	{
1202	iTransformShift = std::max<Int>(0, iTransformShift);
1203	}
1204
1205	const Int iQBits = QUANT_SHIFT + cQP.per + iTransformShift;
1206	// QBits will be OK for any internal bit depth as the reduction in transform shift is balanced by an increase in Qp_per due to QpBDOffset
1207
1208	#if ADAPTIVE_QP_SELECTION
1209	Int iQBitsC = MAX_INT;
1210	Int iAddC = MAX_INT;
1211
1212	if (m_bUseAdaptQpSelect)
1213	{
1214	iQBitsC = iQBits - ARL_C_PRECISION;
1215	iAddC = 1 << (iQBitsC-1);
1216	}
1217	#endif
1218
1219	const Int iAdd = (pcCU->getSlice()->getSliceType()==I_SLICE ? 171 : 85) << (iQBits-9);
1220	const Int qBits8 = iQBits - 8;
1221
1222	for( Int uiBlockPos = 0; uiBlockPos < uiWidth*uiHeight; uiBlockPos++ )
1223	{
1224	const TCoeff iLevel = piCoef[uiBlockPos];
1225	const TCoeff iSign = (iLevel < 0 ? -1: 1);
1226
1227	const Int64 tmpLevel = (Int64)abs(iLevel) * (enableScalingLists ? piQuantCoeff[uiBlockPos] : defaultQuantisationCoefficient);
1228
1229	#if ADAPTIVE_QP_SELECTION
1230	if( m_bUseAdaptQpSelect )
1231	{
1232	piArlCCoef[uiBlockPos] = (TCoeff)((tmpLevel + iAddC ) >> iQBitsC);
1233	}
1234	#endif
1235
1236	const TCoeff quantisedMagnitude = TCoeff((tmpLevel + iAdd ) >> iQBits);
1237	deltaU[uiBlockPos] = (TCoeff)((tmpLevel - (quantisedMagnitude<<iQBits) )>> qBits8);
1238
1239	uiAbsSum += quantisedMagnitude;
1240	const TCoeff quantisedCoefficient = quantisedMagnitude * iSign;
1241
1242	piQCoef[uiBlockPos] = Clip3<TCoeff>( entropyCodingMinimum, entropyCodingMaximum, quantisedCoefficient );
1243	} // for n
1244
1245	if( pcCU->getSlice()->getPPS()->getSignHideFlag() )
1246	{
1247	if(uiAbsSum >= 2) //this prevents TUs with only one coefficient of value 1 from being tested
1248	{
1249	signBitHidingHDQ( piQCoef, piCoef, deltaU, codingParameters, maxLog2TrDynamicRange ) ;
1250	}
1251	}
1252	} //if RDOQ
1253	//return;
1254	}
1255
#if T0196_SELECTIVE_RDOQ
/** Decide whether rate-distortion optimised quantisation is worth running on a block.
*  \param rTu    [in] transform unit being processed
*  \param pSrc   [in] transform coefficients
*  \param compID [in] colour component
*  \param cQP    [in] quantisation parameter (per / rem)
*  \return true as soon as one coefficient quantises to a non-zero magnitude
*          with the rounding offset used here, false if none does
*/
Bool TComTrQuant::xNeedRDOQ( TComTU &rTu, TCoeff * pSrc, const ComponentID compID, const QpParam &cQP )
{
  const TComRectangle &tuArea = rTu.getRect(compID);
  const UInt blkWidth         = tuArea.width;
  const UInt blkHeight        = tuArea.height;
  TComDataCU *cu              = rTu.getCU();
  const UInt partIdx          = rTu.GetAbsPartIdxTU();
  const Int  bitDepth         = cu->getSlice()->getSPS()->getBitDepth(toChannelType(compID));

  const Bool isTransformSkipped = cu->getTransformSkip(partIdx, compID);
  const Int  dynamicRangeLog2   = cu->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID));

  const UInt log2Size = rTu.GetEquivalentLog2TrSize(compID);

  Int scalingListType = getScalingListType(cu->getPredictionMode(partIdx), compID);
  assert(scalingListType < SCALING_LIST_NUM);
  Int *listScales = getQuantCoeff(scalingListType, cQP.rem, log2Size-2);

  const Bool perPositionScale = getUseScalingList(blkWidth, blkHeight, (cu->getTransformSkip(partIdx, compID) != 0));
  const Int  flatScale        = g_quantScales[cQP.rem];

  /* for 422 chroma blocks, the effective scaling applied during transformation is not a power of 2, hence it cannot be
   * implemented as a bit-shift (the quantised result will be sqrt(2) * larger than required). Alternatively, adjust the
   * log2 size applied in the transform shift, such that the result is 1/sqrt(2) the required result (i.e. smaller)
   * Then a QP+3 (sqrt(2)) or QP-3 (1/sqrt(2)) method could be used to get the required result
   */

  // Scaling introduced by the forward transform; never negative for transform-skip with extended precision
  const Int  rawTransformShift = getTransformShift(bitDepth, log2Size, dynamicRangeLog2);
  const Bool clampShiftToZero  = isTransformSkipped && cu->getSlice()->getSPS()->getSpsRangeExtension().getExtendedPrecisionProcessingFlag();
  const Int  transformShift    = clampShiftToZero ? std::max<Int>(0, rawTransformShift) : rawTransformShift;

  // The total shift stays valid for any internal bit depth: a smaller transform shift is balanced by a larger cQP.per
  const Int quantShift = QUANT_SHIFT + cQP.per + transformShift;

  // This rounding offset deliberately differs from the one used by the normal quantiser
  const Int roundingOffset = (compID == COMPONENT_Y ? 171 : 256) << (quantShift-9);

  const UInt numCoeffs = blkWidth*blkHeight;
  for (UInt pos = 0; pos < numCoeffs; ++pos)
  {
    const TCoeff coefficient = pSrc[pos];
    const Int    scale       = perPositionScale ? listScales[pos] : flatScale;
    const Int64  scaledLevel = (Int64)abs(coefficient) * scale;
    const TCoeff magnitude   = TCoeff((scaledLevel + roundingOffset ) >> quantShift);

    if (magnitude != 0)
    {
      return true;
    }
  }

  return false;
}
#endif
1313
1314	Void TComTrQuant::xDeQuant( TComTU &rTu,
1315	const TCoeff * pSrc,
1316	TCoeff * pDes,
1317	const ComponentID compID,
1318	const QpParam &cQP )
1319	{
1320	assert(compID<MAX_NUM_COMPONENT);
1321
1322	TComDataCU *pcCU = rTu.getCU();
1323	const UInt uiAbsPartIdx = rTu.GetAbsPartIdxTU();
1324	const TComRectangle &rect = rTu.getRect(compID);
1325	const UInt uiWidth = rect.width;
1326	const UInt uiHeight = rect.height;
1327	const TCoeff *const piQCoef = pSrc;
1328	TCoeff *const piCoef = pDes;
1329	const UInt uiLog2TrSize = rTu.GetEquivalentLog2TrSize(compID);
1330	const UInt numSamplesInBlock = uiWidth*uiHeight;
1331	const Int maxLog2TrDynamicRange = pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID));
1332	const TCoeff transformMinimum = -(1 << maxLog2TrDynamicRange);
1333	const TCoeff transformMaximum = (1 << maxLog2TrDynamicRange) - 1;
1334	const Bool enableScalingLists = getUseScalingList(uiWidth, uiHeight, (pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0));
1335	const Int scalingListType = getScalingListType(pcCU->getPredictionMode(uiAbsPartIdx), compID);
1336	#if O0043_BEST_EFFORT_DECODING
1337	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getStreamBitDepth(toChannelType(compID));
1338	#else
1339	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
1340	#endif
1341
1342	assert (scalingListType < SCALING_LIST_NUM);
1343	assert ( uiWidth <= m_uiMaxTrSize );
1344
1345	// Represents scaling through forward transform
1346	const Bool bClipTransformShiftTo0 = (pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0) && pcCU->getSlice()->getSPS()->getSpsRangeExtension().getExtendedPrecisionProcessingFlag();
1347	const Int originalTransformShift = getTransformShift(channelBitDepth, uiLog2TrSize, maxLog2TrDynamicRange);
1348	const Int iTransformShift = bClipTransformShiftTo0 ? std::max<Int>(0, originalTransformShift) : originalTransformShift;
1349
1350	const Int QP_per = cQP.per;
1351	const Int QP_rem = cQP.rem;
1352
1353	const Int rightShift = (IQUANT_SHIFT - (iTransformShift + QP_per)) + (enableScalingLists ? LOG2_SCALING_LIST_NEUTRAL_VALUE : 0);
1354
1355	if(enableScalingLists)
1356	{
1357	//from the dequantisation equation:
1358	//iCoeffQ = ((Intermediate_Int(clipQCoef) * piDequantCoef[deQuantIdx]) + iAdd ) >> rightShift
1359	//(sizeof(Intermediate_Int) * 8) = inputBitDepth + dequantCoefBits - rightShift
1360	const UInt dequantCoefBits = 1 + IQUANT_SHIFT + SCALING_LIST_BITS;
1361	const UInt targetInputBitDepth = std::min<UInt>((maxLog2TrDynamicRange + 1), (((sizeof(Intermediate_Int) * 8) + rightShift) - dequantCoefBits));
1362
1363	const Intermediate_Int inputMinimum = -(1 << (targetInputBitDepth - 1));
1364	const Intermediate_Int inputMaximum = (1 << (targetInputBitDepth - 1)) - 1;
1365
1366	Int *piDequantCoef = getDequantCoeff(scalingListType,QP_rem,uiLog2TrSize-2);
1367
1368	if(rightShift > 0)
1369	{
1370	const Intermediate_Int iAdd = 1 << (rightShift - 1);
1371
1372	for( Int n = 0; n < numSamplesInBlock; n++ )
1373	{
1374	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, piQCoef[n]));
1375	const Intermediate_Int iCoeffQ = ((Intermediate_Int(clipQCoef) * piDequantCoef[n]) + iAdd ) >> rightShift;
1376
1377	piCoef[n] = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
1378	}
1379	}
1380	else
1381	{
1382	const Int leftShift = -rightShift;
1383
1384	for( Int n = 0; n < numSamplesInBlock; n++ )
1385	{
1386	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, piQCoef[n]));
1387	const Intermediate_Int iCoeffQ = (Intermediate_Int(clipQCoef) * piDequantCoef[n]) << leftShift;
1388
1389	piCoef[n] = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
1390	}
1391	}
1392	}
1393	else
1394	{
1395	const Int scale = g_invQuantScales[QP_rem];
1396	const Int scaleBits = (IQUANT_SHIFT + 1) ;
1397
1398	//from the dequantisation equation:
1399	//iCoeffQ = Intermediate_Int((Int64(clipQCoef) * scale + iAdd) >> rightShift);
1400	//(sizeof(Intermediate_Int) * 8) = inputBitDepth + scaleBits - rightShift
1401	const UInt targetInputBitDepth = std::min<UInt>((maxLog2TrDynamicRange + 1), (((sizeof(Intermediate_Int) * 8) + rightShift) - scaleBits));
1402	const Intermediate_Int inputMinimum = -(1 << (targetInputBitDepth - 1));
1403	const Intermediate_Int inputMaximum = (1 << (targetInputBitDepth - 1)) - 1;
1404
1405	if (rightShift > 0)
1406	{
1407	const Intermediate_Int iAdd = 1 << (rightShift - 1);
1408
1409	for( Int n = 0; n < numSamplesInBlock; n++ )
1410	{
1411	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, piQCoef[n]));
1412	const Intermediate_Int iCoeffQ = (Intermediate_Int(clipQCoef) * scale + iAdd) >> rightShift;
1413
1414	piCoef[n] = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
1415	}
1416	}
1417	else
1418	{
1419	const Int leftShift = -rightShift;
1420
1421	for( Int n = 0; n < numSamplesInBlock; n++ )
1422	{
1423	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, piQCoef[n]));
1424	const Intermediate_Int iCoeffQ = (Intermediate_Int(clipQCoef) * scale) << leftShift;
1425
1426	piCoef[n] = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
1427	}
1428	}
1429	}
1430	}
1431
1432
1433	Void TComTrQuant::init( UInt uiMaxTrSize,
1434	Bool bUseRDOQ,
1435	Bool bUseRDOQTS,
1436	#if T0196_SELECTIVE_RDOQ
1437	Bool useSelectiveRDOQ,
1438	#endif
1439	Bool bEnc,
1440	Bool useTransformSkipFast
1441	#if ADAPTIVE_QP_SELECTION
1442	, Bool bUseAdaptQpSelect
1443	#endif
1444	)
1445	{
1446	m_uiMaxTrSize = uiMaxTrSize;
1447	m_bEnc = bEnc;
1448	m_useRDOQ = bUseRDOQ;
1449	m_useRDOQTS = bUseRDOQTS;
1450	#if T0196_SELECTIVE_RDOQ
1451	m_useSelectiveRDOQ = useSelectiveRDOQ;
1452	#endif
1453	#if ADAPTIVE_QP_SELECTION
1454	m_bUseAdaptQpSelect = bUseAdaptQpSelect;
1455	#endif
1456	m_useTransformSkipFast = useTransformSkipFast;
1457	}
1458
1459
1460	Void TComTrQuant::transformNxN( TComTU & rTu,
1461	const ComponentID compID,
1462	Pel * pcResidual,
1463	const UInt uiStride,
1464	TCoeff * rpcCoeff,
1465	#if ADAPTIVE_QP_SELECTION
1466	TCoeff * pcArlCoeff,
1467	#endif
1468	TCoeff & uiAbsSum,
1469	const QpParam & cQP
1470	)
1471	{
1472	const TComRectangle &rect = rTu.getRect(compID);
1473	const UInt uiWidth = rect.width;
1474	const UInt uiHeight = rect.height;
1475	TComDataCU* pcCU = rTu.getCU();
1476	const UInt uiAbsPartIdx = rTu.GetAbsPartIdxTU();
1477	const UInt uiOrgTrDepth = rTu.GetTransformDepthRel();
1478
1479	uiAbsSum=0;
1480
1481	RDPCMMode rdpcmMode = RDPCM_OFF;
1482	rdpcmNxN( rTu, compID, pcResidual, uiStride, cQP, rpcCoeff, uiAbsSum, rdpcmMode );
1483
1484	if (rdpcmMode == RDPCM_OFF)
1485	{
1486	uiAbsSum = 0;
1487	//transform and quantise
1488	if(pcCU->getCUTransquantBypass(uiAbsPartIdx))
1489	{
1490	const Bool rotateResidual = rTu.isNonTransformedResidualRotated(compID);
1491	const UInt uiSizeMinus1 = (uiWidth * uiHeight) - 1;
1492
1493	for (UInt y = 0, coefficientIndex = 0; y<uiHeight; y++)
1494	{
1495	for (UInt x = 0; x<uiWidth; x++, coefficientIndex++)
1496	{
1497	const Pel currentSample = pcResidual[(y * uiStride) + x];
1498
1499	rpcCoeff[rotateResidual ? (uiSizeMinus1 - coefficientIndex) : coefficientIndex] = currentSample;
1500	uiAbsSum += TCoeff(abs(currentSample));
1501	}
1502	}
1503	}
1504	else
1505	{
1506	#if DEBUG_TRANSFORM_AND_QUANTISE
1507	std::cout << g_debugCounter << ": " << uiWidth << "x" << uiHeight << " channel " << compID << " TU at input to transform\n";
1508	printBlock(pcResidual, uiWidth, uiHeight, uiStride);
1509	#endif
1510
1511	assert( (pcCU->getSlice()->getSPS()->getMaxTrSize() >= uiWidth) );
1512
1513	if(pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0)
1514	{
1515	xTransformSkip( pcResidual, uiStride, m_plTempCoeff, rTu, compID );
1516	}
1517	else
1518	{
1519	const Int channelBitDepth=pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
1520	xT( channelBitDepth, rTu.useDST(compID), pcResidual, uiStride, m_plTempCoeff, uiWidth, uiHeight, pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID)) );
1521	}
1522
1523	#if DEBUG_TRANSFORM_AND_QUANTISE
1524	std::cout << g_debugCounter << ": " << uiWidth << "x" << uiHeight << " channel " << compID << " TU between transform and quantiser\n";
1525	printBlock(m_plTempCoeff, uiWidth, uiHeight, uiWidth);
1526	#endif
1527
1528	xQuant( rTu, m_plTempCoeff, rpcCoeff,
1529
1530	#if ADAPTIVE_QP_SELECTION
1531	pcArlCoeff,
1532	#endif
1533	uiAbsSum, compID, cQP );
1534
1535	#if DEBUG_TRANSFORM_AND_QUANTISE
1536	std::cout << g_debugCounter << ": " << uiWidth << "x" << uiHeight << " channel " << compID << " TU at output of quantiser\n";
1537	printBlock(rpcCoeff, uiWidth, uiHeight, uiWidth);
1538	#endif
1539	}
1540	}
1541
1542	//set the CBF
1543	pcCU->setCbfPartRange((((uiAbsSum > 0) ? 1 : 0) << uiOrgTrDepth), compID, uiAbsPartIdx, rTu.GetAbsPartIdxNumParts(compID));
1544	}
1545
1546
1547	Void TComTrQuant::invTransformNxN( TComTU &rTu,
1548	const ComponentID compID,
1549	Pel *pcResidual,
1550	const UInt uiStride,
1551	TCoeff * pcCoeff,
1552	const QpParam &cQP
1553	DEBUG_STRING_FN_DECLAREP(psDebug))
1554	{
1555	TComDataCU* pcCU=rTu.getCU();
1556	const UInt uiAbsPartIdx = rTu.GetAbsPartIdxTU();
1557	const TComRectangle &rect = rTu.getRect(compID);
1558	const UInt uiWidth = rect.width;
1559	const UInt uiHeight = rect.height;
1560
1561	if (uiWidth != uiHeight) //for intra, the TU will have been split above this level, so this condition won't be true, hence this only affects inter
1562	{
1563	//------------------------------------------------
1564
1565	//recurse deeper
1566
1567	TComTURecurse subTURecurse(rTu, false, TComTU::VERTICAL_SPLIT, true, compID);
1568
1569	do
1570	{
1571	//------------------
1572
1573	const UInt lineOffset = subTURecurse.GetSectionNumber() * subTURecurse.getRect(compID).height;
1574
1575	Pel subTUResidual = pcResidual + (lineOffset uiStride);
1576	TCoeff subTUCoefficients = pcCoeff + (lineOffset subTURecurse.getRect(compID).width);
1577
1578	invTransformNxN(subTURecurse, compID, subTUResidual, uiStride, subTUCoefficients, cQP DEBUG_STRING_PASS_INTO(psDebug));
1579
1580	//------------------
1581
1582	} while (subTURecurse.nextSection(rTu));
1583
1584	//------------------------------------------------
1585
1586	return;
1587	}
1588
1589	#if DEBUG_STRING
1590	if (psDebug)
1591	{
1592	std::stringstream ss(stringstream::out);
1593	printBlockToStream(ss, (compID==0)?"###InvTran ip Ch0: " : ((compID==1)?"###InvTran ip Ch1: ":"###InvTran ip Ch2: "), pcCoeff, uiWidth, uiHeight, uiWidth);
1594	DEBUG_STRING_APPEND((*psDebug), ss.str())
1595	}
1596	#endif
1597
1598	if(pcCU->getCUTransquantBypass(uiAbsPartIdx))
1599	{
1600	const Bool rotateResidual = rTu.isNonTransformedResidualRotated(compID);
1601	const UInt uiSizeMinus1 = (uiWidth * uiHeight) - 1;
1602
1603	for (UInt y = 0, coefficientIndex = 0; y<uiHeight; y++)
1604	{
1605	for (UInt x = 0; x<uiWidth; x++, coefficientIndex++)
1606	{
1607	pcResidual[(y * uiStride) + x] = Pel(pcCoeff[rotateResidual ? (uiSizeMinus1 - coefficientIndex) : coefficientIndex]);
1608	}
1609	}
1610	}
1611	else
1612	{
1613	#if DEBUG_TRANSFORM_AND_QUANTISE
1614	std::cout << g_debugCounter << ": " << uiWidth << "x" << uiHeight << " channel " << compID << " TU at input to dequantiser\n";
1615	printBlock(pcCoeff, uiWidth, uiHeight, uiWidth);
1616	#endif
1617
1618	xDeQuant(rTu, pcCoeff, m_plTempCoeff, compID, cQP);
1619
1620	#if DEBUG_TRANSFORM_AND_QUANTISE
1621	std::cout << g_debugCounter << ": " << uiWidth << "x" << uiHeight << " channel " << compID << " TU between dequantiser and inverse-transform\n";
1622	printBlock(m_plTempCoeff, uiWidth, uiHeight, uiWidth);
1623	#endif
1624
1625	#if DEBUG_STRING
1626	if (psDebug)
1627	{
1628	std::stringstream ss(stringstream::out);
1629	printBlockToStream(ss, "###InvTran deq: ", m_plTempCoeff, uiWidth, uiHeight, uiWidth);
1630	(*psDebug)+=ss.str();
1631	}
1632	#endif
1633
1634	if(pcCU->getTransformSkip(uiAbsPartIdx, compID))
1635	{
1636	xITransformSkip( m_plTempCoeff, pcResidual, uiStride, rTu, compID );
1637
1638	#if DEBUG_STRING
1639	if (psDebug)
1640	{
1641	std::stringstream ss(stringstream::out);
1642	printBlockToStream(ss, "###InvTran resi: ", pcResidual, uiWidth, uiHeight, uiStride);
1643	(*psDebug)+=ss.str();
1644	(*psDebug)+="(<- was a Transform-skipped block)\n";
1645	}
1646	#endif
1647	}
1648	else
1649	{
1650	#if O0043_BEST_EFFORT_DECODING
1651	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getStreamBitDepth(toChannelType(compID));
1652	#else
1653	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
1654	#endif
1655	xIT( channelBitDepth, rTu.useDST(compID), m_plTempCoeff, pcResidual, uiStride, uiWidth, uiHeight, pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID)) );
1656
1657	#if DEBUG_STRING
1658	if (psDebug)
1659	{
1660	std::stringstream ss(stringstream::out);
1661	printBlockToStream(ss, "###InvTran resi: ", pcResidual, uiWidth, uiHeight, uiStride);
1662	(*psDebug)+=ss.str();
1663	(*psDebug)+="(<- was a Transformed block)\n";
1664	}
1665	#endif
1666	}
1667
1668	#if DEBUG_TRANSFORM_AND_QUANTISE
1669	std::cout << g_debugCounter << ": " << uiWidth << "x" << uiHeight << " channel " << compID << " TU at output of inverse-transform\n";
1670	printBlock(pcResidual, uiWidth, uiHeight, uiStride);
1671	g_debugCounter++;
1672	#endif
1673	}
1674
1675	invRdpcmNxN( rTu, compID, pcResidual, uiStride );
1676	}
1677
1678	Void TComTrQuant::invRecurTransformNxN( const ComponentID compID,
1679	TComYuv *pResidual,
1680	TComTU &rTu)
1681	{
1682	if (!rTu.ProcessComponentSection(compID))
1683	{
1684	return;
1685	}
1686
1687	TComDataCU* pcCU = rTu.getCU();
1688	UInt absPartIdxTU = rTu.GetAbsPartIdxTU();
1689	UInt uiTrMode=rTu.GetTransformDepthRel();
1690	if( (pcCU->getCbf(absPartIdxTU, compID, uiTrMode) == 0) && (isLuma(compID) \|\| !pcCU->getSlice()->getPPS()->getPpsRangeExtension().getCrossComponentPredictionEnabledFlag()) )
1691	{
1692	return;
1693	}
1694
1695	if( uiTrMode == pcCU->getTransformIdx( absPartIdxTU ) )
1696	{
1697	const TComRectangle &tuRect = rTu.getRect(compID);
1698	const Int uiStride = pResidual->getStride( compID );
1699	Pel *rpcResidual = pResidual->getAddr( compID );
1700	UInt uiAddr = (tuRect.x0 + uiStride*tuRect.y0);
1701	Pel *pResi = rpcResidual + uiAddr;
1702	TCoeff *pcCoeff = pcCU->getCoeff(compID) + rTu.getCoefficientOffset(compID);
1703
1704	const QpParam cQP(*pcCU, compID);
1705
1706	if(pcCU->getCbf(absPartIdxTU, compID, uiTrMode) != 0)
1707	{
1708	DEBUG_STRING_NEW(sTemp)
1709	#if DEBUG_STRING
1710	std::string *psDebug=((DebugOptionList::DebugString_InvTran.getInt()&(pcCU->isIntra(absPartIdxTU)?1:(pcCU->isInter(absPartIdxTU)?2:4)))!=0) ? &sTemp : 0;
1711	#endif
1712
1713	invTransformNxN( rTu, compID, pResi, uiStride, pcCoeff, cQP DEBUG_STRING_PASS_INTO(psDebug) );
1714
1715	#if DEBUG_STRING
1716	if (psDebug != 0)
1717	{
1718	std::cout << (*psDebug);
1719	}
1720	#endif
1721	}
1722
1723	if (isChroma(compID) && (pcCU->getCrossComponentPredictionAlpha(absPartIdxTU, compID) != 0))
1724	{
1725	const Pel *piResiLuma = pResidual->getAddr( COMPONENT_Y );
1726	const Int strideLuma = pResidual->getStride( COMPONENT_Y );
1727	const Int tuWidth = rTu.getRect( compID ).width;
1728	const Int tuHeight = rTu.getRect( compID ).height;
1729
1730	if(pcCU->getCbf(absPartIdxTU, COMPONENT_Y, uiTrMode) != 0)
1731	{
1732	pResi = rpcResidual + uiAddr;
1733	const Pel *pResiLuma = piResiLuma + uiAddr;
1734
1735	crossComponentPrediction( rTu, compID, pResiLuma, pResi, pResi, tuWidth, tuHeight, strideLuma, uiStride, uiStride, true );
1736	}
1737	}
1738	}
1739	else
1740	{
1741	TComTURecurse tuRecurseChild(rTu, false);
1742	do
1743	{
1744	invRecurTransformNxN( compID, pResidual, tuRecurseChild );
1745	} while (tuRecurseChild.nextSection(rTu));
1746	}
1747	}
1748
1749	Void TComTrQuant::applyForwardRDPCM( TComTU& rTu, const ComponentID compID, Pel* pcResidual, const UInt uiStride, const QpParam& cQP, TCoeff* pcCoeff, TCoeff &uiAbsSum, const RDPCMMode mode )
1750	{
1751	TComDataCU *pcCU=rTu.getCU();
1752	const UInt uiAbsPartIdx=rTu.GetAbsPartIdxTU();
1753
1754	const Bool bLossless = pcCU->getCUTransquantBypass( uiAbsPartIdx );
1755	const UInt uiWidth = rTu.getRect(compID).width;
1756	const UInt uiHeight = rTu.getRect(compID).height;
1757	const Bool rotateResidual = rTu.isNonTransformedResidualRotated(compID);
1758	const UInt uiSizeMinus1 = (uiWidth * uiHeight) - 1;
1759
1760	UInt uiX = 0;
1761	UInt uiY = 0;
1762
1763	UInt &majorAxis = (mode == RDPCM_VER) ? uiX : uiY;
1764	UInt &minorAxis = (mode == RDPCM_VER) ? uiY : uiX;
1765	const UInt majorAxisLimit = (mode == RDPCM_VER) ? uiWidth : uiHeight;
1766	const UInt minorAxisLimit = (mode == RDPCM_VER) ? uiHeight : uiWidth;
1767
1768	const Bool bUseHalfRoundingPoint = (mode != RDPCM_OFF);
1769
1770	uiAbsSum = 0;
1771
1772	for ( majorAxis = 0; majorAxis < majorAxisLimit; majorAxis++ )
1773	{
1774	TCoeff accumulatorValue = 0; // 32-bit accumulator
1775	for ( minorAxis = 0; minorAxis < minorAxisLimit; minorAxis++ )
1776	{
1777	const UInt sampleIndex = (uiY * uiWidth) + uiX;
1778	const UInt coefficientIndex = (rotateResidual ? (uiSizeMinus1-sampleIndex) : sampleIndex);
1779	const Pel currentSample = pcResidual[(uiY * uiStride) + uiX];
1780	const TCoeff encoderSideDelta = TCoeff(currentSample) - accumulatorValue;
1781
1782	Pel reconstructedDelta;
1783	if ( bLossless )
1784	{
1785	pcCoeff[coefficientIndex] = encoderSideDelta;
1786	reconstructedDelta = (Pel) encoderSideDelta;
1787	}
1788	else
1789	{
1790	transformSkipQuantOneSample(rTu, compID, encoderSideDelta, pcCoeff, coefficientIndex, cQP, bUseHalfRoundingPoint);
1791	invTrSkipDeQuantOneSample (rTu, compID, pcCoeff[coefficientIndex], reconstructedDelta, cQP, coefficientIndex);
1792	}
1793
1794	uiAbsSum += abs(pcCoeff[coefficientIndex]);
1795
1796	if (mode != RDPCM_OFF)
1797	{
1798	accumulatorValue += reconstructedDelta;
1799	}
1800	}
1801	}
1802	}
1803
1804	Void TComTrQuant::rdpcmNxN ( TComTU& rTu, const ComponentID compID, Pel* pcResidual, const UInt uiStride, const QpParam& cQP, TCoeff* pcCoeff, TCoeff &uiAbsSum, RDPCMMode& rdpcmMode )
1805	{
1806	TComDataCU *pcCU=rTu.getCU();
1807	const UInt uiAbsPartIdx=rTu.GetAbsPartIdxTU();
1808
1809	if (!pcCU->isRDPCMEnabled(uiAbsPartIdx) \|\| ((pcCU->getTransformSkip(uiAbsPartIdx, compID) == 0) && !pcCU->getCUTransquantBypass(uiAbsPartIdx)))
1810	{
1811	rdpcmMode = RDPCM_OFF;
1812	}
1813	else if ( pcCU->isIntra( uiAbsPartIdx ) )
1814	{
1815	const ChromaFormat chFmt = pcCU->getPic()->getPicYuvOrg()->getChromaFormat();
1816	const ChannelType chType = toChannelType(compID);
1817	const UInt uiChPredMode = pcCU->getIntraDir( chType, uiAbsPartIdx );
1818	const TComSPS *sps=pcCU->getSlice()->getSPS();
1819	const UInt partsPerMinCU = 1<<(2*(sps->getMaxTotalCUDepth() - sps->getLog2DiffMaxMinCodingBlockSize()));
1820	const UInt uiChCodedMode = (uiChPredMode==DM_CHROMA_IDX && isChroma(compID)) ? pcCU->getIntraDir(CHANNEL_TYPE_LUMA, getChromasCorrespondingPULumaIdx(uiAbsPartIdx, chFmt, partsPerMinCU)) : uiChPredMode;
1821	const UInt uiChFinalMode = ((chFmt == CHROMA_422) && isChroma(compID)) ? g_chroma422IntraAngleMappingTable[uiChCodedMode] : uiChCodedMode;
1822
1823	if (uiChFinalMode == VER_IDX \|\| uiChFinalMode == HOR_IDX)
1824	{
1825	rdpcmMode = (uiChFinalMode == VER_IDX) ? RDPCM_VER : RDPCM_HOR;
1826	applyForwardRDPCM( rTu, compID, pcResidual, uiStride, cQP, pcCoeff, uiAbsSum, rdpcmMode );
1827	}
1828	else
1829	{
1830	rdpcmMode = RDPCM_OFF;
1831	}
1832	}
1833	else // not intra, need to select the best mode
1834	{
1835	const UInt uiWidth = rTu.getRect(compID).width;
1836	const UInt uiHeight = rTu.getRect(compID).height;
1837
1838	RDPCMMode bestMode = NUMBER_OF_RDPCM_MODES;
1839	TCoeff bestAbsSum = std::numeric_limits<TCoeff>::max();
1840	TCoeff bestCoefficients[MAX_TU_SIZE * MAX_TU_SIZE];
1841
1842	for (UInt modeIndex = 0; modeIndex < NUMBER_OF_RDPCM_MODES; modeIndex++)
1843	{
1844	const RDPCMMode mode = RDPCMMode(modeIndex);
1845
1846	TCoeff currAbsSum = 0;
1847
1848	applyForwardRDPCM( rTu, compID, pcResidual, uiStride, cQP, pcCoeff, currAbsSum, mode );
1849
1850	if (currAbsSum < bestAbsSum)
1851	{
1852	bestMode = mode;
1853	bestAbsSum = currAbsSum;
1854	if (mode != RDPCM_OFF)
1855	{
1856	memcpy(bestCoefficients, pcCoeff, (uiWidth * uiHeight * sizeof(TCoeff)));
1857	}
1858	}
1859	}
1860
1861	rdpcmMode = bestMode;
1862	uiAbsSum = bestAbsSum;
1863
1864	if (rdpcmMode != RDPCM_OFF) //the TU is re-transformed and quantised if DPCM_OFF is returned, so there is no need to preserve it here
1865	{
1866	memcpy(pcCoeff, bestCoefficients, (uiWidth * uiHeight * sizeof(TCoeff)));
1867	}
1868	}
1869
1870	pcCU->setExplicitRdpcmModePartRange(rdpcmMode, compID, uiAbsPartIdx, rTu.GetAbsPartIdxNumParts(compID));
1871	}
1872
1873	Void TComTrQuant::invRdpcmNxN( TComTU& rTu, const ComponentID compID, Pel* pcResidual, const UInt uiStride )
1874	{
1875	TComDataCU *pcCU=rTu.getCU();
1876	const UInt uiAbsPartIdx=rTu.GetAbsPartIdxTU();
1877
1878	if (pcCU->isRDPCMEnabled( uiAbsPartIdx ) && ((pcCU->getTransformSkip(uiAbsPartIdx, compID ) != 0) \|\| pcCU->getCUTransquantBypass(uiAbsPartIdx)))
1879	{
1880	const UInt uiWidth = rTu.getRect(compID).width;
1881	const UInt uiHeight = rTu.getRect(compID).height;
1882
1883	RDPCMMode rdpcmMode = RDPCM_OFF;
1884
1885	if ( pcCU->isIntra( uiAbsPartIdx ) )
1886	{
1887	const ChromaFormat chFmt = pcCU->getPic()->getPicYuvRec()->getChromaFormat();
1888	const ChannelType chType = toChannelType(compID);
1889	const UInt uiChPredMode = pcCU->getIntraDir( chType, uiAbsPartIdx );
1890	const TComSPS *sps=pcCU->getSlice()->getSPS();
1891	const UInt partsPerMinCU = 1<<(2*(sps->getMaxTotalCUDepth() - sps->getLog2DiffMaxMinCodingBlockSize()));
1892	const UInt uiChCodedMode = (uiChPredMode==DM_CHROMA_IDX && isChroma(compID)) ? pcCU->getIntraDir(CHANNEL_TYPE_LUMA, getChromasCorrespondingPULumaIdx(uiAbsPartIdx, chFmt, partsPerMinCU)) : uiChPredMode;
1893	const UInt uiChFinalMode = ((chFmt == CHROMA_422) && isChroma(compID)) ? g_chroma422IntraAngleMappingTable[uiChCodedMode] : uiChCodedMode;
1894
1895	if (uiChFinalMode == VER_IDX \|\| uiChFinalMode == HOR_IDX)
1896	{
1897	rdpcmMode = (uiChFinalMode == VER_IDX) ? RDPCM_VER : RDPCM_HOR;
1898	}
1899	}
1900	else // not intra case
1901	{
1902	rdpcmMode = RDPCMMode(pcCU->getExplicitRdpcmMode( compID, uiAbsPartIdx ));
1903	}
1904
1905	const TCoeff pelMin=(TCoeff) std::numeric_limits<Pel>::min();
1906	const TCoeff pelMax=(TCoeff) std::numeric_limits<Pel>::max();
1907	if (rdpcmMode == RDPCM_VER)
1908	{
1909	for( UInt uiX = 0; uiX < uiWidth; uiX++ )
1910	{
1911	Pel *pcCurResidual = pcResidual+uiX;
1912	TCoeff accumulator = *pcCurResidual; // 32-bit accumulator
1913	pcCurResidual+=uiStride;
1914	for( UInt uiY = 1; uiY < uiHeight; uiY++, pcCurResidual+=uiStride )
1915	{
1916	accumulator += *(pcCurResidual);
1917	*pcCurResidual = (Pel)Clip3<TCoeff>(pelMin, pelMax, accumulator);
1918	}
1919	}
1920	}
1921	else if (rdpcmMode == RDPCM_HOR)
1922	{
1923	for( UInt uiY = 0; uiY < uiHeight; uiY++ )
1924	{
1925	Pel pcCurResidual = pcResidual+uiYuiStride;
1926	TCoeff accumulator = *pcCurResidual;
1927	pcCurResidual++;
1928	for( UInt uiX = 1; uiX < uiWidth; uiX++, pcCurResidual++ )
1929	{
1930	accumulator += *(pcCurResidual);
1931	*pcCurResidual = (Pel)Clip3<TCoeff>(pelMin, pelMax, accumulator);
1932	}
1933	}
1934	}
1935	}
1936	}
1937
1938	// ------------------------------------------------------------------------------------------------
1939	// Logical transform
1940	// ------------------------------------------------------------------------------------------------
1941
1942	/** Wrapper function between HM interface and core NxN forward transform (2D)
1943	* \param channelBitDepth bit depth of channel
1944	* \param useDST
1945	* \param piBlkResi input data (residual)
1946	* \param uiStride stride of input residual data
1947	* \param psCoeff output data (transform coefficients)
1948	* \param iWidth transform width
1949	* \param iHeight transform height
1950	* \param maxLog2TrDynamicRange
1951	*/
1952	Void TComTrQuant::xT( const Int channelBitDepth, Bool useDST, Pel* piBlkResi, UInt uiStride, TCoeff* psCoeff, Int iWidth, Int iHeight, const Int maxLog2TrDynamicRange )
1953	{
1954	#if MATRIX_MULT
1955	if( iWidth == iHeight)
1956	{
1957	xTr(channelBitDepth, piBlkResi, psCoeff, uiStride, (UInt)iWidth, useDST, maxLog2TrDynamicRange);
1958	return;
1959	}
1960	#endif
1961
1962	TCoeff block[ MAX_TU_SIZE * MAX_TU_SIZE ];
1963	TCoeff coeff[ MAX_TU_SIZE * MAX_TU_SIZE ];
1964
1965	for (Int y = 0; y < iHeight; y++)
1966	{
1967	for (Int x = 0; x < iWidth; x++)
1968	{
1969	block[(y * iWidth) + x] = piBlkResi[(y * uiStride) + x];
1970	}
1971	}
1972
1973	xTrMxN( channelBitDepth, block, coeff, iWidth, iHeight, useDST, maxLog2TrDynamicRange );
1974
1975	memcpy(psCoeff, coeff, (iWidth * iHeight * sizeof(TCoeff)));
1976	}
1977
1978	/** Wrapper function between HM interface and core NxN inverse transform (2D)
1979	* \param channelBitDepth bit depth of channel
1980	* \param useDST
1981	* \param plCoef input data (transform coefficients)
1982	* \param pResidual output data (residual)
1983	* \param uiStride stride of input residual data
1984	* \param iWidth transform width
1985	* \param iHeight transform height
1986	* \param maxLog2TrDynamicRange
1987	*/
1988	Void TComTrQuant::xIT( const Int channelBitDepth, Bool useDST, TCoeff* plCoef, Pel* pResidual, UInt uiStride, Int iWidth, Int iHeight, const Int maxLog2TrDynamicRange )
1989	{
1990	#if MATRIX_MULT
1991	if( iWidth == iHeight )
1992	{
1993	xITr(channelBitDepth, plCoef, pResidual, uiStride, (UInt)iWidth, useDST, maxLog2TrDynamicRange);
1994	return;
1995	}
1996	#endif
1997
1998	TCoeff block[ MAX_TU_SIZE * MAX_TU_SIZE ];
1999	TCoeff coeff[ MAX_TU_SIZE * MAX_TU_SIZE ];
2000
2001	memcpy(coeff, plCoef, (iWidth * iHeight * sizeof(TCoeff)));
2002
2003	xITrMxN( channelBitDepth, coeff, block, iWidth, iHeight, useDST, maxLog2TrDynamicRange );
2004
2005	for (Int y = 0; y < iHeight; y++)
2006	{
2007	for (Int x = 0; x < iWidth; x++)
2008	{
2009	pResidual[(y * uiStride) + x] = Pel(block[(y * iWidth) + x]);
2010	}
2011	}
2012	}
2013
2014	/** Wrapper function between HM interface and core 4x4 transform skipping
2015	* \param piBlkResi input data (residual)
2016	* \param uiStride stride of input residual data
2017	* \param psCoeff output data (transform coefficients)
2018	* \param rTu reference to transform data
2019	* \param component colour component
2020	*/
2021	Void TComTrQuant::xTransformSkip( Pel* piBlkResi, UInt uiStride, TCoeff* psCoeff, TComTU &rTu, const ComponentID component )
2022	{
2023	const TComRectangle &rect = rTu.getRect(component);
2024	const Int width = rect.width;
2025	const Int height = rect.height;
2026	const Int maxLog2TrDynamicRange = rTu.getCU()->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(component));
2027	const Int channelBitDepth = rTu.getCU()->getSlice()->getSPS()->getBitDepth(toChannelType(component));
2028
2029	Int iTransformShift = getTransformShift(channelBitDepth, rTu.GetEquivalentLog2TrSize(component), maxLog2TrDynamicRange);
2030	if (rTu.getCU()->getSlice()->getSPS()->getSpsRangeExtension().getExtendedPrecisionProcessingFlag())
2031	{
2032	iTransformShift = std::max<Int>(0, iTransformShift);
2033	}
2034
2035	const Bool rotateResidual = rTu.isNonTransformedResidualRotated(component);
2036	const UInt uiSizeMinus1 = (width * height) - 1;
2037
2038	if (iTransformShift >= 0)
2039	{
2040	for (UInt y = 0, coefficientIndex = 0; y < height; y++)
2041	{
2042	for (UInt x = 0; x < width; x++, coefficientIndex++)
2043	{
2044	psCoeff[rotateResidual ? (uiSizeMinus1 - coefficientIndex) : coefficientIndex] = TCoeff(piBlkResi[(y * uiStride) + x]) << iTransformShift;
2045	}
2046	}
2047	}
2048	else //for very high bit depths
2049	{
2050	iTransformShift = -iTransformShift;
2051	const TCoeff offset = 1 << (iTransformShift - 1);
2052
2053	for (UInt y = 0, coefficientIndex = 0; y < height; y++)
2054	{
2055	for (UInt x = 0; x < width; x++, coefficientIndex++)
2056	{
2057	psCoeff[rotateResidual ? (uiSizeMinus1 - coefficientIndex) : coefficientIndex] = (TCoeff(piBlkResi[(y * uiStride) + x]) + offset) >> iTransformShift;
2058	}
2059	}
2060	}
2061	}
2062
2063	/** Wrapper function between HM interface and core NxN transform skipping
2064	* \param plCoef input data (coefficients)
2065	* \param pResidual output data (residual)
2066	* \param uiStride stride of input residual data
2067	* \param rTu reference to transform data
2068	* \param component colour component ID
2069	*/
2070	Void TComTrQuant::xITransformSkip( TCoeff* plCoef, Pel* pResidual, UInt uiStride, TComTU &rTu, const ComponentID component )
2071	{
2072	const TComRectangle &rect = rTu.getRect(component);
2073	const Int width = rect.width;
2074	const Int height = rect.height;
2075	const Int maxLog2TrDynamicRange = rTu.getCU()->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(component));
2076	#if O0043_BEST_EFFORT_DECODING
2077	const Int channelBitDepth = rTu.getCU()->getSlice()->getSPS()->getStreamBitDepth(toChannelType(component));
2078	#else
2079	const Int channelBitDepth = rTu.getCU()->getSlice()->getSPS()->getBitDepth(toChannelType(component));
2080	#endif
2081
2082	Int iTransformShift = getTransformShift(channelBitDepth, rTu.GetEquivalentLog2TrSize(component), maxLog2TrDynamicRange);
2083	if (rTu.getCU()->getSlice()->getSPS()->getSpsRangeExtension().getExtendedPrecisionProcessingFlag())
2084	{
2085	iTransformShift = std::max<Int>(0, iTransformShift);
2086	}
2087
2088	const Bool rotateResidual = rTu.isNonTransformedResidualRotated(component);
2089	const UInt uiSizeMinus1 = (width * height) - 1;
2090
2091	if (iTransformShift >= 0)
2092	{
2093	const TCoeff offset = iTransformShift==0 ? 0 : (1 << (iTransformShift - 1));
2094
2095	for (UInt y = 0, coefficientIndex = 0; y < height; y++)
2096	{
2097	for (UInt x = 0; x < width; x++, coefficientIndex++)
2098	{
2099	pResidual[(y * uiStride) + x] = Pel((plCoef[rotateResidual ? (uiSizeMinus1 - coefficientIndex) : coefficientIndex] + offset) >> iTransformShift);
2100	}
2101	}
2102	}
2103	else //for very high bit depths
2104	{
2105	iTransformShift = -iTransformShift;
2106
2107	for (UInt y = 0, coefficientIndex = 0; y < height; y++)
2108	{
2109	for (UInt x = 0; x < width; x++, coefficientIndex++)
2110	{
2111	pResidual[(y * uiStride) + x] = Pel(plCoef[rotateResidual ? (uiSizeMinus1 - coefficientIndex) : coefficientIndex] << iTransformShift);
2112	}
2113	}
2114	}
2115	}
2116
2117	/** RDOQ with CABAC
2118	* \param rTu reference to transform data
2119	* \param plSrcCoeff pointer to input buffer
2120	* \param piDstCoeff reference to pointer to output buffer
2121	* \param piArlDstCoeff
2122	* \param uiAbsSum reference to absolute sum of quantized transform coefficient
2123	* \param compID colour component ID
2124	* \param cQP reference to quantization parameters
2125
2126	* Rate distortion optimized quantization for entropy
2127	* coding engines using probability models like CABAC
2128	*/
2129	Void TComTrQuant::xRateDistOptQuant ( TComTU &rTu,
2130	TCoeff * plSrcCoeff,
2131	TCoeff * piDstCoeff,
2132	#if ADAPTIVE_QP_SELECTION
2133	TCoeff * piArlDstCoeff,
2134	#endif
2135	TCoeff &uiAbsSum,
2136	const ComponentID compID,
2137	const QpParam &cQP )
2138	{
2139	const TComRectangle & rect = rTu.getRect(compID);
2140	const UInt uiWidth = rect.width;
2141	const UInt uiHeight = rect.height;
2142	TComDataCU * pcCU = rTu.getCU();
2143	const UInt uiAbsPartIdx = rTu.GetAbsPartIdxTU();
2144	const ChannelType channelType = toChannelType(compID);
2145	const UInt uiLog2TrSize = rTu.GetEquivalentLog2TrSize(compID);
2146
2147	const Bool extendedPrecision = pcCU->getSlice()->getSPS()->getSpsRangeExtension().getExtendedPrecisionProcessingFlag();
2148	const Int maxLog2TrDynamicRange = pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID));
2149	const Int channelBitDepth = rTu.getCU()->getSlice()->getSPS()->getBitDepth(channelType);
2150
2151	/* for 422 chroma blocks, the effective scaling applied during transformation is not a power of 2, hence it cannot be
2152	* implemented as a bit-shift (the quantised result will be sqrt(2) * larger than required). Alternatively, adjust the
2153	* uiLog2TrSize applied in iTransformShift, such that the result is 1/sqrt(2) the required result (i.e. smaller)
2154	* Then a QP+3 (sqrt(2)) or QP-3 (1/sqrt(2)) method could be used to get the required result
2155	*/
2156
2157	// Represents scaling through forward transform
2158	Int iTransformShift = getTransformShift(channelBitDepth, uiLog2TrSize, maxLog2TrDynamicRange);
2159	if ((pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0) && extendedPrecision)
2160	{
2161	iTransformShift = std::max<Int>(0, iTransformShift);
2162	}
2163
2164	const Bool bUseGolombRiceParameterAdaptation = pcCU->getSlice()->getSPS()->getSpsRangeExtension().getPersistentRiceAdaptationEnabledFlag();
2165	const UInt initialGolombRiceParameter = m_pcEstBitsSbac->golombRiceAdaptationStatistics[rTu.getGolombRiceStatisticsIndex(compID)] / RExt__GOLOMB_RICE_INCREMENT_DIVISOR;
2166	UInt uiGoRiceParam = initialGolombRiceParameter;
2167	Double d64BlockUncodedCost = 0;
2168	const UInt uiLog2BlockWidth = g_aucConvertToBit[ uiWidth ] + 2;
2169	const UInt uiLog2BlockHeight = g_aucConvertToBit[ uiHeight ] + 2;
2170	const UInt uiMaxNumCoeff = uiWidth * uiHeight;
2171	assert(compID<MAX_NUM_COMPONENT);
2172
2173	Int scalingListType = getScalingListType(pcCU->getPredictionMode(uiAbsPartIdx), compID);
2174	assert(scalingListType < SCALING_LIST_NUM);
2175
2176	#if ADAPTIVE_QP_SELECTION
2177	memset(piArlDstCoeff, 0, sizeof(TCoeff) * uiMaxNumCoeff);
2178	#endif
2179
2180	Double pdCostCoeff [ MAX_TU_SIZE * MAX_TU_SIZE ];
2181	Double pdCostSig [ MAX_TU_SIZE * MAX_TU_SIZE ];
2182	Double pdCostCoeff0[ MAX_TU_SIZE * MAX_TU_SIZE ];
2183	memset( pdCostCoeff, 0, sizeof(Double) * uiMaxNumCoeff );
2184	memset( pdCostSig, 0, sizeof(Double) * uiMaxNumCoeff );
2185	Int rateIncUp [ MAX_TU_SIZE * MAX_TU_SIZE ];
2186	Int rateIncDown [ MAX_TU_SIZE * MAX_TU_SIZE ];
2187	Int sigRateDelta[ MAX_TU_SIZE * MAX_TU_SIZE ];
2188	TCoeff deltaU [ MAX_TU_SIZE * MAX_TU_SIZE ];
2189	memset( rateIncUp, 0, sizeof(Int ) * uiMaxNumCoeff );
2190	memset( rateIncDown, 0, sizeof(Int ) * uiMaxNumCoeff );
2191	memset( sigRateDelta, 0, sizeof(Int ) * uiMaxNumCoeff );
2192	memset( deltaU, 0, sizeof(TCoeff) * uiMaxNumCoeff );
2193
2194	const Int iQBits = QUANT_SHIFT + cQP.per + iTransformShift; // Right shift of non-RDOQ quantizer; level = (coeff*uiQ + offset)>>q_bits
2195	const Double *const pdErrScale = getErrScaleCoeff(scalingListType, (uiLog2TrSize-2), cQP.rem);
2196	const Int *const piQCoef = getQuantCoeff(scalingListType, cQP.rem, (uiLog2TrSize-2));
2197
2198	const Bool enableScalingLists = getUseScalingList(uiWidth, uiHeight, (pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0));
2199	const Int defaultQuantisationCoefficient = g_quantScales[cQP.rem];
2200	const Double defaultErrorScale = getErrScaleCoeffNoScalingList(scalingListType, (uiLog2TrSize-2), cQP.rem);
2201
2202	const TCoeff entropyCodingMinimum = -(1 << maxLog2TrDynamicRange);
2203	const TCoeff entropyCodingMaximum = (1 << maxLog2TrDynamicRange) - 1;
2204
2205	#if ADAPTIVE_QP_SELECTION
2206	Int iQBitsC = iQBits - ARL_C_PRECISION;
2207	Int iAddC = 1 << (iQBitsC-1);
2208	#endif
2209
2210	TUEntropyCodingParameters codingParameters;
2211	getTUEntropyCodingParameters(codingParameters, rTu, compID);
2212	const UInt uiCGSize = (1 << MLS_CG_SIZE);
2213
2214	Double pdCostCoeffGroupSig[ MLS_GRP_NUM ];
2215	UInt uiSigCoeffGroupFlag[ MLS_GRP_NUM ];
2216	Int iCGLastScanPos = -1;
2217
2218	UInt uiCtxSet = 0;
2219	Int c1 = 1;
2220	Int c2 = 0;
2221	Double d64BaseCost = 0;
2222	Int iLastScanPos = -1;
2223
2224	UInt c1Idx = 0;
2225	UInt c2Idx = 0;
2226	Int baseLevel;
2227
2228	memset( pdCostCoeffGroupSig, 0, sizeof(Double) * MLS_GRP_NUM );
2229	memset( uiSigCoeffGroupFlag, 0, sizeof(UInt) * MLS_GRP_NUM );
2230
2231	UInt uiCGNum = uiWidth * uiHeight >> MLS_CG_SIZE;
2232	Int iScanPos;
2233	coeffGroupRDStats rdStats;
2234
2235	const UInt significanceMapContextOffset = getSignificanceMapContextOffset(compID);
2236
2237	for (Int iCGScanPos = uiCGNum-1; iCGScanPos >= 0; iCGScanPos--)
2238	{
2239	UInt uiCGBlkPos = codingParameters.scanCG[ iCGScanPos ];
2240	UInt uiCGPosY = uiCGBlkPos / codingParameters.widthInGroups;
2241	UInt uiCGPosX = uiCGBlkPos - (uiCGPosY * codingParameters.widthInGroups);
2242
2243	memset( &rdStats, 0, sizeof (coeffGroupRDStats));
2244
2245	const Int patternSigCtx = TComTrQuant::calcPatternSigCtx(uiSigCoeffGroupFlag, uiCGPosX, uiCGPosY, codingParameters.widthInGroups, codingParameters.heightInGroups);
2246
2247	for (Int iScanPosinCG = uiCGSize-1; iScanPosinCG >= 0; iScanPosinCG--)
2248	{
2249	iScanPos = iCGScanPos*uiCGSize + iScanPosinCG;
2250	//===== quantization =====
2251	UInt uiBlkPos = codingParameters.scan[iScanPos];
2252	// set coeff
2253
2254	const Int quantisationCoefficient = (enableScalingLists) ? piQCoef [uiBlkPos] : defaultQuantisationCoefficient;
2255	const Double errorScale = (enableScalingLists) ? pdErrScale[uiBlkPos] : defaultErrorScale;
2256
2257	const Int64 tmpLevel = Int64(abs(plSrcCoeff[ uiBlkPos ])) * quantisationCoefficient;
2258
2259	const Intermediate_Int lLevelDouble = (Intermediate_Int)min<Int64>(tmpLevel, std::numeric_limits<Intermediate_Int>::max() - (Intermediate_Int(1) << (iQBits - 1)));
2260
2261	#if ADAPTIVE_QP_SELECTION
2262	if( m_bUseAdaptQpSelect )
2263	{
2264	piArlDstCoeff[uiBlkPos] = (TCoeff)(( lLevelDouble + iAddC) >> iQBitsC );
2265	}
2266	#endif
2267	const UInt uiMaxAbsLevel = std::min<UInt>(UInt(entropyCodingMaximum), UInt((lLevelDouble + (Intermediate_Int(1) << (iQBits - 1))) >> iQBits));
2268
2269	const Double dErr = Double( lLevelDouble );
2270	pdCostCoeff0[ iScanPos ] = dErr * dErr * errorScale;
2271	d64BlockUncodedCost += pdCostCoeff0[ iScanPos ];
2272	piDstCoeff[ uiBlkPos ] = uiMaxAbsLevel;
2273
2274	if ( uiMaxAbsLevel > 0 && iLastScanPos < 0 )
2275	{
2276	iLastScanPos = iScanPos;
2277	uiCtxSet = getContextSetIndex(compID, (iScanPos >> MLS_CG_SIZE), 0);
2278	iCGLastScanPos = iCGScanPos;
2279	}
2280
2281	if ( iLastScanPos >= 0 )
2282	{
2283	//===== coefficient level estimation =====
2284	UInt uiLevel;
2285	UInt uiOneCtx = (NUM_ONE_FLAG_CTX_PER_SET * uiCtxSet) + c1;
2286	UInt uiAbsCtx = (NUM_ABS_FLAG_CTX_PER_SET * uiCtxSet) + c2;
2287
2288	if( iScanPos == iLastScanPos )
2289	{
2290	uiLevel = xGetCodedLevel( pdCostCoeff[ iScanPos ], pdCostCoeff0[ iScanPos ], pdCostSig[ iScanPos ],
2291	lLevelDouble, uiMaxAbsLevel, significanceMapContextOffset, uiOneCtx, uiAbsCtx, uiGoRiceParam,
2292	c1Idx, c2Idx, iQBits, errorScale, 1, extendedPrecision, maxLog2TrDynamicRange
2293	);
2294	}
2295	else
2296	{
2297	UShort uiCtxSig = significanceMapContextOffset + getSigCtxInc( patternSigCtx, codingParameters, iScanPos, uiLog2BlockWidth, uiLog2BlockHeight, channelType );
2298
2299	uiLevel = xGetCodedLevel( pdCostCoeff[ iScanPos ], pdCostCoeff0[ iScanPos ], pdCostSig[ iScanPos ],
2300	lLevelDouble, uiMaxAbsLevel, uiCtxSig, uiOneCtx, uiAbsCtx, uiGoRiceParam,
2301	c1Idx, c2Idx, iQBits, errorScale, 0, extendedPrecision, maxLog2TrDynamicRange
2302	);
2303
2304	sigRateDelta[ uiBlkPos ] = m_pcEstBitsSbac->significantBits[ uiCtxSig ][ 1 ] - m_pcEstBitsSbac->significantBits[ uiCtxSig ][ 0 ];
2305	}
2306
2307	deltaU[ uiBlkPos ] = TCoeff((lLevelDouble - (Intermediate_Int(uiLevel) << iQBits)) >> (iQBits-8));
2308
2309	if( uiLevel > 0 )
2310	{
2311	Int rateNow = xGetICRate( uiLevel, uiOneCtx, uiAbsCtx, uiGoRiceParam, c1Idx, c2Idx, extendedPrecision, maxLog2TrDynamicRange );
2312	rateIncUp [ uiBlkPos ] = xGetICRate( uiLevel+1, uiOneCtx, uiAbsCtx, uiGoRiceParam, c1Idx, c2Idx, extendedPrecision, maxLog2TrDynamicRange ) - rateNow;
2313	rateIncDown [ uiBlkPos ] = xGetICRate( uiLevel-1, uiOneCtx, uiAbsCtx, uiGoRiceParam, c1Idx, c2Idx, extendedPrecision, maxLog2TrDynamicRange ) - rateNow;
2314	}
2315	else // uiLevel == 0
2316	{
2317	rateIncUp [ uiBlkPos ] = m_pcEstBitsSbac->m_greaterOneBits[ uiOneCtx ][ 0 ];
2318	}
2319	piDstCoeff[ uiBlkPos ] = uiLevel;
2320	d64BaseCost += pdCostCoeff [ iScanPos ];
2321
2322	baseLevel = (c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx < C2FLAG_NUMBER)) : 1;
2323	if( uiLevel >= baseLevel )
2324	{
2325	if (uiLevel > 3*(1<<uiGoRiceParam))
2326	{
2327	uiGoRiceParam = bUseGolombRiceParameterAdaptation ? (uiGoRiceParam + 1) : (std::min<UInt>((uiGoRiceParam + 1), 4));
2328	}
2329	}
2330	if ( uiLevel >= 1)
2331	{
2332	c1Idx ++;
2333	}
2334
2335	//===== update bin model =====
2336	if( uiLevel > 1 )
2337	{
2338	c1 = 0;
2339	c2 += (c2 < 2);
2340	c2Idx ++;
2341	}
2342	else if( (c1 < 3) && (c1 > 0) && uiLevel)
2343	{
2344	c1++;
2345	}
2346
2347	//===== context set update =====
2348	if( ( iScanPos % uiCGSize == 0 ) && ( iScanPos > 0 ) )
2349	{
2350	uiCtxSet = getContextSetIndex(compID, ((iScanPos - 1) >> MLS_CG_SIZE), (c1 == 0)); //(iScanPos - 1) because we do this before entering the final group
2351	c1 = 1;
2352	c2 = 0;
2353	c1Idx = 0;
2354	c2Idx = 0;
2355	uiGoRiceParam = initialGolombRiceParameter;
2356	}
2357	}
2358	else
2359	{
2360	d64BaseCost += pdCostCoeff0[ iScanPos ];
2361	}
2362	rdStats.d64SigCost += pdCostSig[ iScanPos ];
2363	if (iScanPosinCG == 0 )
2364	{
2365	rdStats.d64SigCost_0 = pdCostSig[ iScanPos ];
2366	}
2367	if (piDstCoeff[ uiBlkPos ] )
2368	{
2369	uiSigCoeffGroupFlag[ uiCGBlkPos ] = 1;
2370	rdStats.d64CodedLevelandDist += pdCostCoeff[ iScanPos ] - pdCostSig[ iScanPos ];
2371	rdStats.d64UncodedDist += pdCostCoeff0[ iScanPos ];
2372	if ( iScanPosinCG != 0 )
2373	{
2374	rdStats.iNNZbeforePos0++;
2375	}
2376	}
2377	} //end for (iScanPosinCG)
2378
2379	if (iCGLastScanPos >= 0)
2380	{
2381	if( iCGScanPos )
2382	{
2383	if (uiSigCoeffGroupFlag[ uiCGBlkPos ] == 0)
2384	{
2385	UInt uiCtxSig = getSigCoeffGroupCtxInc( uiSigCoeffGroupFlag, uiCGPosX, uiCGPosY, codingParameters.widthInGroups, codingParameters.heightInGroups );
2386	d64BaseCost += xGetRateSigCoeffGroup(0, uiCtxSig) - rdStats.d64SigCost;;
2387	pdCostCoeffGroupSig[ iCGScanPos ] = xGetRateSigCoeffGroup(0, uiCtxSig);
2388	}
2389	else
2390	{
2391	if (iCGScanPos < iCGLastScanPos) //skip the last coefficient group, which will be handled together with last position below.
2392	{
2393	if ( rdStats.iNNZbeforePos0 == 0 )
2394	{
2395	d64BaseCost -= rdStats.d64SigCost_0;
2396	rdStats.d64SigCost -= rdStats.d64SigCost_0;
2397	}
2398	// rd-cost if SigCoeffGroupFlag = 0, initialization
2399	Double d64CostZeroCG = d64BaseCost;
2400
2401	// add SigCoeffGroupFlag cost to total cost
2402	UInt uiCtxSig = getSigCoeffGroupCtxInc( uiSigCoeffGroupFlag, uiCGPosX, uiCGPosY, codingParameters.widthInGroups, codingParameters.heightInGroups );
2403
2404	if (iCGScanPos < iCGLastScanPos)
2405	{
2406	d64BaseCost += xGetRateSigCoeffGroup(1, uiCtxSig);
2407	d64CostZeroCG += xGetRateSigCoeffGroup(0, uiCtxSig);
2408	pdCostCoeffGroupSig[ iCGScanPos ] = xGetRateSigCoeffGroup(1, uiCtxSig);
2409	}
2410
2411	// try to convert the current coeff group from non-zero to all-zero
2412	d64CostZeroCG += rdStats.d64UncodedDist; // distortion for resetting non-zero levels to zero levels
2413	d64CostZeroCG -= rdStats.d64CodedLevelandDist; // distortion and level cost for keeping all non-zero levels
2414	d64CostZeroCG -= rdStats.d64SigCost; // sig cost for all coeffs, including zero levels and non-zerl levels
2415
2416	// if we can save cost, change this block to all-zero block
2417	if ( d64CostZeroCG < d64BaseCost )
2418	{
2419	uiSigCoeffGroupFlag[ uiCGBlkPos ] = 0;
2420	d64BaseCost = d64CostZeroCG;
2421	if (iCGScanPos < iCGLastScanPos)
2422	{
2423	pdCostCoeffGroupSig[ iCGScanPos ] = xGetRateSigCoeffGroup(0, uiCtxSig);
2424	}
2425	// reset coeffs to 0 in this block
2426	for (Int iScanPosinCG = uiCGSize-1; iScanPosinCG >= 0; iScanPosinCG--)
2427	{
2428	iScanPos = iCGScanPos*uiCGSize + iScanPosinCG;
2429	UInt uiBlkPos = codingParameters.scan[ iScanPos ];
2430
2431	if (piDstCoeff[ uiBlkPos ])
2432	{
2433	piDstCoeff [ uiBlkPos ] = 0;
2434	pdCostCoeff[ iScanPos ] = pdCostCoeff0[ iScanPos ];
2435	pdCostSig [ iScanPos ] = 0;
2436	}
2437	}
2438	} // end if ( d64CostAllZeros < d64BaseCost )
2439	}
2440	} // end if if (uiSigCoeffGroupFlag[ uiCGBlkPos ] == 0)
2441	}
2442	else
2443	{
2444	uiSigCoeffGroupFlag[ uiCGBlkPos ] = 1;
2445	}
2446	}
2447	} //end for (iCGScanPos)
2448
2449	//===== estimate last position =====
2450	if ( iLastScanPos < 0 )
2451	{
2452	return;
2453	}
2454
2455	Double d64BestCost = 0;
2456	Int ui16CtxCbf = 0;
2457	Int iBestLastIdxP1 = 0;
2458	if( !pcCU->isIntra( uiAbsPartIdx ) && isLuma(compID) && pcCU->getTransformIdx( uiAbsPartIdx ) == 0 )
2459	{
2460	ui16CtxCbf = 0;
2461	d64BestCost = d64BlockUncodedCost + xGetICost( m_pcEstBitsSbac->blockRootCbpBits[ ui16CtxCbf ][ 0 ] );
2462	d64BaseCost += xGetICost( m_pcEstBitsSbac->blockRootCbpBits[ ui16CtxCbf ][ 1 ] );
2463	}
2464	else
2465	{
2466	ui16CtxCbf = pcCU->getCtxQtCbf( rTu, channelType );
2467	ui16CtxCbf += getCBFContextOffset(compID);
2468	d64BestCost = d64BlockUncodedCost + xGetICost( m_pcEstBitsSbac->blockCbpBits[ ui16CtxCbf ][ 0 ] );
2469	d64BaseCost += xGetICost( m_pcEstBitsSbac->blockCbpBits[ ui16CtxCbf ][ 1 ] );
2470	}
2471
2472
2473	Bool bFoundLast = false;
2474	for (Int iCGScanPos = iCGLastScanPos; iCGScanPos >= 0; iCGScanPos--)
2475	{
2476	UInt uiCGBlkPos = codingParameters.scanCG[ iCGScanPos ];
2477
2478	d64BaseCost -= pdCostCoeffGroupSig [ iCGScanPos ];
2479	if (uiSigCoeffGroupFlag[ uiCGBlkPos ])
2480	{
2481	for (Int iScanPosinCG = uiCGSize-1; iScanPosinCG >= 0; iScanPosinCG--)
2482	{
2483	iScanPos = iCGScanPos*uiCGSize + iScanPosinCG;
2484
2485	if (iScanPos > iLastScanPos)
2486	{
2487	continue;
2488	}
2489	UInt uiBlkPos = codingParameters.scan[iScanPos];
2490
2491	if( piDstCoeff[ uiBlkPos ] )
2492	{
2493	UInt uiPosY = uiBlkPos >> uiLog2BlockWidth;
2494	UInt uiPosX = uiBlkPos - ( uiPosY << uiLog2BlockWidth );
2495
2496	Double d64CostLast= codingParameters.scanType == SCAN_VER ? xGetRateLast( uiPosY, uiPosX, compID ) : xGetRateLast( uiPosX, uiPosY, compID );
2497	Double totalCost = d64BaseCost + d64CostLast - pdCostSig[ iScanPos ];
2498
2499	if( totalCost < d64BestCost )
2500	{
2501	iBestLastIdxP1 = iScanPos + 1;
2502	d64BestCost = totalCost;
2503	}
2504	if( piDstCoeff[ uiBlkPos ] > 1 )
2505	{
2506	bFoundLast = true;
2507	break;
2508	}
2509	d64BaseCost -= pdCostCoeff[ iScanPos ];
2510	d64BaseCost += pdCostCoeff0[ iScanPos ];
2511	}
2512	else
2513	{
2514	d64BaseCost -= pdCostSig[ iScanPos ];
2515	}
2516	} //end for
2517	if (bFoundLast)
2518	{
2519	break;
2520	}
2521	} // end if (uiSigCoeffGroupFlag[ uiCGBlkPos ])
2522	} // end for
2523
2524
2525	for ( Int scanPos = 0; scanPos < iBestLastIdxP1; scanPos++ )
2526	{
2527	Int blkPos = codingParameters.scan[ scanPos ];
2528	TCoeff level = piDstCoeff[ blkPos ];
2529	uiAbsSum += level;
2530	piDstCoeff[ blkPos ] = ( plSrcCoeff[ blkPos ] < 0 ) ? -level : level;
2531	}
2532
2533	//===== clean uncoded coefficients =====
2534	for ( Int scanPos = iBestLastIdxP1; scanPos <= iLastScanPos; scanPos++ )
2535	{
2536	piDstCoeff[ codingParameters.scan[ scanPos ] ] = 0;
2537	}
2538
2539
2540	if( pcCU->getSlice()->getPPS()->getSignHideFlag() && uiAbsSum>=2)
2541	{
2542	const Double inverseQuantScale = Double(g_invQuantScales[cQP.rem]);
2543	Int64 rdFactor = (Int64)(inverseQuantScale * inverseQuantScale * (1 << (2 * cQP.per))
2544	/ m_dLambda / 16 / (1 << (2 * DISTORTION_PRECISION_ADJUSTMENT(channelBitDepth - 8)))
2545	+ 0.5);
2546
2547	Int lastCG = -1;
2548	Int absSum = 0 ;
2549	Int n ;
2550
2551	for( Int subSet = (uiWidth*uiHeight-1) >> MLS_CG_SIZE; subSet >= 0; subSet-- )
2552	{
2553	Int subPos = subSet << MLS_CG_SIZE;
2554	Int firstNZPosInCG=uiCGSize , lastNZPosInCG=-1 ;
2555	absSum = 0 ;
2556
2557	for(n = uiCGSize-1; n >= 0; --n )
2558	{
2559	if( piDstCoeff[ codingParameters.scan[ n + subPos ]] )
2560	{
2561	lastNZPosInCG = n;
2562	break;
2563	}
2564	}
2565
2566	for(n = 0; n <uiCGSize; n++ )
2567	{
2568	if( piDstCoeff[ codingParameters.scan[ n + subPos ]] )
2569	{
2570	firstNZPosInCG = n;
2571	break;
2572	}
2573	}
2574
2575	for(n = firstNZPosInCG; n <=lastNZPosInCG; n++ )
2576	{
2577	absSum += Int(piDstCoeff[ codingParameters.scan[ n + subPos ]]);
2578	}
2579
2580	if(lastNZPosInCG>=0 && lastCG==-1)
2581	{
2582	lastCG = 1;
2583	}
2584
2585	if( lastNZPosInCG-firstNZPosInCG>=SBH_THRESHOLD )
2586	{
2587	UInt signbit = (piDstCoeff[codingParameters.scan[subPos+firstNZPosInCG]]>0?0:1);
2588	if( signbit!=(absSum&0x1) ) // hide but need tune
2589	{
2590	// calculate the cost
2591	Int64 minCostInc = std::numeric_limits<Int64>::max(), curCost = std::numeric_limits<Int64>::max();
2592	Int minPos = -1, finalChange = 0, curChange = 0;
2593
2594	for( n = (lastCG==1?lastNZPosInCG:uiCGSize-1) ; n >= 0; --n )
2595	{
2596	UInt uiBlkPos = codingParameters.scan[ n + subPos ];
2597	if(piDstCoeff[ uiBlkPos ] != 0 )
2598	{
2599	Int64 costUp = rdFactor * ( - deltaU[uiBlkPos] ) + rateIncUp[uiBlkPos];
2600	Int64 costDown = rdFactor * ( deltaU[uiBlkPos] ) + rateIncDown[uiBlkPos]
2601	- ((abs(piDstCoeff[uiBlkPos]) == 1) ? sigRateDelta[uiBlkPos] : 0);
2602
2603	if(lastCG==1 && lastNZPosInCG==n && abs(piDstCoeff[uiBlkPos])==1)
2604	{
2605	costDown -= (4<<15);
2606	}
2607
2608	if(costUp<costDown)
2609	{
2610	curCost = costUp;
2611	curChange = 1;
2612	}
2613	else
2614	{
2615	curChange = -1;
2616	if(n==firstNZPosInCG && abs(piDstCoeff[uiBlkPos])==1)
2617	{
2618	curCost = std::numeric_limits<Int64>::max();
2619	}
2620	else
2621	{
2622	curCost = costDown;
2623	}
2624	}
2625	}
2626	else
2627	{
2628	curCost = rdFactor * ( - (abs(deltaU[uiBlkPos])) ) + (1<<15) + rateIncUp[uiBlkPos] + sigRateDelta[uiBlkPos] ;
2629	curChange = 1 ;
2630
2631	if(n<firstNZPosInCG)
2632	{
2633	UInt thissignbit = (plSrcCoeff[uiBlkPos]>=0?0:1);
2634	if(thissignbit != signbit )
2635	{
2636	curCost = std::numeric_limits<Int64>::max();
2637	}
2638	}
2639	}
2640
2641	if( curCost<minCostInc)
2642	{
2643	minCostInc = curCost;
2644	finalChange = curChange;
2645	minPos = uiBlkPos;
2646	}
2647	}
2648
2649	if(piDstCoeff[minPos] == entropyCodingMaximum \|\| piDstCoeff[minPos] == entropyCodingMinimum)
2650	{
2651	finalChange = -1;
2652	}
2653
2654	if(plSrcCoeff[minPos]>=0)
2655	{
2656	piDstCoeff[minPos] += finalChange ;
2657	}
2658	else
2659	{
2660	piDstCoeff[minPos] -= finalChange ;
2661	}
2662	}
2663	}
2664
2665	if(lastCG==1)
2666	{
2667	lastCG=0 ;
2668	}
2669	}
2670	}
2671	}
2672
2673
2674	/** Pattern decision for context derivation process of significant_coeff_flag
2675	* \param sigCoeffGroupFlag pointer to prior coded significant coeff group
2676	* \param uiCGPosX column of current coefficient group
2677	* \param uiCGPosY row of current coefficient group
2678	* \param widthInGroups width of the block
2679	* \param heightInGroups height of the block
2680	* \returns pattern for current coefficient group
2681	*/
2682	Int TComTrQuant::calcPatternSigCtx( const UInt* sigCoeffGroupFlag, UInt uiCGPosX, UInt uiCGPosY, UInt widthInGroups, UInt heightInGroups )
2683	{
2684	if ((widthInGroups <= 1) && (heightInGroups <= 1))
2685	{
2686	return 0;
2687	}
2688
2689	const Bool rightAvailable = uiCGPosX < (widthInGroups - 1);
2690	const Bool belowAvailable = uiCGPosY < (heightInGroups - 1);
2691
2692	UInt sigRight = 0;
2693	UInt sigLower = 0;
2694
2695	if (rightAvailable)
2696	{
2697	sigRight = ((sigCoeffGroupFlag[ (uiCGPosY * widthInGroups) + uiCGPosX + 1 ] != 0) ? 1 : 0);
2698	}
2699	if (belowAvailable)
2700	{
2701	sigLower = ((sigCoeffGroupFlag[ (uiCGPosY + 1) * widthInGroups + uiCGPosX ] != 0) ? 1 : 0);
2702	}
2703
2704	return sigRight + (sigLower << 1);
2705	}
2706
2707
2708	/** Context derivation process of coeff_abs_significant_flag
2709	* \param patternSigCtx pattern for current coefficient group
2710	* \param codingParameters coding parameters for the TU (includes the scan)
2711	* \param scanPosition current position in scan order
2712	* \param log2BlockWidth log2 width of the block
2713	* \param log2BlockHeight log2 height of the block
2714	* \param chanType channel type (CHANNEL_TYPE_LUMA/CHROMA)
2715	* \returns ctxInc for current scan position
2716	*/
2717	Int TComTrQuant::getSigCtxInc ( Int patternSigCtx,
2718	const TUEntropyCodingParameters &codingParameters,
2719	const Int scanPosition,
2720	const Int log2BlockWidth,
2721	const Int log2BlockHeight,
2722	const ChannelType chanType)
2723	{
2724	if (codingParameters.firstSignificanceMapContext == significanceMapContextSetStart[chanType][CONTEXT_TYPE_SINGLE])
2725	{
2726	//single context mode
2727	return significanceMapContextSetStart[chanType][CONTEXT_TYPE_SINGLE];
2728	}
2729
2730	const UInt rasterPosition = codingParameters.scan[scanPosition];
2731	const UInt posY = rasterPosition >> log2BlockWidth;
2732	const UInt posX = rasterPosition - (posY << log2BlockWidth);
2733
2734	if ((posX + posY) == 0)
2735	{
2736	return 0; //special case for the DC context variable
2737	}
2738
2739	Int offset = MAX_INT;
2740
2741	if ((log2BlockWidth == 2) && (log2BlockHeight == 2)) //4x4
2742	{
2743	offset = ctxIndMap4x4[ (4 * posY) + posX ];
2744	}
2745	else
2746	{
2747	Int cnt = 0;
2748
2749	switch (patternSigCtx)
2750	{
2751	//------------------
2752
2753	case 0: //neither neighbouring group is significant
2754	{
2755	const Int posXinSubset = posX & ((1 << MLS_CG_LOG2_WIDTH) - 1);
2756	const Int posYinSubset = posY & ((1 << MLS_CG_LOG2_HEIGHT) - 1);
2757	const Int posTotalInSubset = posXinSubset + posYinSubset;
2758
2759	//first N coefficients in scan order use 2; the next few use 1; the rest use 0.
2760	const UInt context1Threshold = NEIGHBOURHOOD_00_CONTEXT_1_THRESHOLD_4x4;
2761	const UInt context2Threshold = NEIGHBOURHOOD_00_CONTEXT_2_THRESHOLD_4x4;
2762
2763	cnt = (posTotalInSubset >= context1Threshold) ? 0 : ((posTotalInSubset >= context2Threshold) ? 1 : 2);
2764	}
2765	break;
2766
2767	//------------------
2768
2769	case 1: //right group is significant, below is not
2770	{
2771	const Int posYinSubset = posY & ((1 << MLS_CG_LOG2_HEIGHT) - 1);
2772	const Int groupHeight = 1 << MLS_CG_LOG2_HEIGHT;
2773
2774	cnt = (posYinSubset >= (groupHeight >> 1)) ? 0 : ((posYinSubset >= (groupHeight >> 2)) ? 1 : 2); //top quarter uses 2; second-from-top quarter uses 1; bottom half uses 0
2775	}
2776	break;
2777
2778	//------------------
2779
2780	case 2: //below group is significant, right is not
2781	{
2782	const Int posXinSubset = posX & ((1 << MLS_CG_LOG2_WIDTH) - 1);
2783	const Int groupWidth = 1 << MLS_CG_LOG2_WIDTH;
2784
2785	cnt = (posXinSubset >= (groupWidth >> 1)) ? 0 : ((posXinSubset >= (groupWidth >> 2)) ? 1 : 2); //left quarter uses 2; second-from-left quarter uses 1; right half uses 0
2786	}
2787	break;
2788
2789	//------------------
2790
2791	case 3: //both neighbouring groups are significant
2792	{
2793	cnt = 2;
2794	}
2795	break;
2796
2797	//------------------
2798
2799	default:
2800	std::cerr << "ERROR: Invalid patternSigCtx \"" << Int(patternSigCtx) << "\" in getSigCtxInc" << std::endl;
2801	exit(1);
2802	break;
2803	}
2804
2805	//------------------------------------------------
2806
2807	const Bool notFirstGroup = ((posX >> MLS_CG_LOG2_WIDTH) + (posY >> MLS_CG_LOG2_HEIGHT)) > 0;
2808
2809	offset = (notFirstGroup ? notFirstGroupNeighbourhoodContextOffset[chanType] : 0) + cnt;
2810	}
2811
2812	return codingParameters.firstSignificanceMapContext + offset;
2813	}
2814
2815
2816	/** Get the best level in RD sense
2817	*
2818	* \returns best quantized transform level for given scan position
2819	*
2820	* This method calculates the best quantized transform level for a given scan position.
2821	*/
2822	__inline UInt TComTrQuant::xGetCodedLevel ( Double& rd64CodedCost, //< reference to coded cost
2823	Double& rd64CodedCost0, //< reference to cost when coefficient is 0
2824	Double& rd64CodedCostSig, //< rd64CodedCostSig reference to cost of significant coefficient
2825	Intermediate_Int lLevelDouble, //< reference to unscaled quantized level
2826	UInt uiMaxAbsLevel, //< scaled quantized level
2827	UShort ui16CtxNumSig, //< current ctxInc for coeff_abs_significant_flag
2828	UShort ui16CtxNumOne, //< current ctxInc for coeff_abs_level_greater1 (1st bin of coeff_abs_level_minus1 in AVC)
2829	UShort ui16CtxNumAbs, //< current ctxInc for coeff_abs_level_greater2 (remaining bins of coeff_abs_level_minus1 in AVC)
2830	UShort ui16AbsGoRice, //< current Rice parameter for coeff_abs_level_minus3
2831	UInt c1Idx, //<
2832	UInt c2Idx, //<
2833	Int iQBits, //< quantization step size
2834	Double errorScale, //<
2835	Bool bLast, //< indicates if the coefficient is the last significant
2836	Bool useLimitedPrefixLength, //<
2837	const Int maxLog2TrDynamicRange //<
2838	) const
2839	{
2840	Double dCurrCostSig = 0;
2841	UInt uiBestAbsLevel = 0;
2842
2843	if( !bLast && uiMaxAbsLevel < 3 )
2844	{
2845	rd64CodedCostSig = xGetRateSigCoef( 0, ui16CtxNumSig );
2846	rd64CodedCost = rd64CodedCost0 + rd64CodedCostSig;
2847	if( uiMaxAbsLevel == 0 )
2848	{
2849	return uiBestAbsLevel;
2850	}
2851	}
2852	else
2853	{
2854	rd64CodedCost = MAX_DOUBLE;
2855	}
2856
2857	if( !bLast )
2858	{
2859	dCurrCostSig = xGetRateSigCoef( 1, ui16CtxNumSig );
2860	}
2861
2862	UInt uiMinAbsLevel = ( uiMaxAbsLevel > 1 ? uiMaxAbsLevel - 1 : 1 );
2863	for( Int uiAbsLevel = uiMaxAbsLevel; uiAbsLevel >= uiMinAbsLevel ; uiAbsLevel-- )
2864	{
2865	Double dErr = Double( lLevelDouble - ( Intermediate_Int(uiAbsLevel) << iQBits ) );
2866	Double dCurrCost = dErr * dErr * errorScale + xGetICost( xGetICRate( uiAbsLevel, ui16CtxNumOne, ui16CtxNumAbs, ui16AbsGoRice, c1Idx, c2Idx, useLimitedPrefixLength, maxLog2TrDynamicRange ) );
2867	dCurrCost += dCurrCostSig;
2868
2869	if( dCurrCost < rd64CodedCost )
2870	{
2871	uiBestAbsLevel = uiAbsLevel;
2872	rd64CodedCost = dCurrCost;
2873	rd64CodedCostSig = dCurrCostSig;
2874	}
2875	}
2876
2877	return uiBestAbsLevel;
2878	}
2879
2880	/** Calculates the cost for specific absolute transform level
2881	* \param uiAbsLevel scaled quantized level
2882	* \param ui16CtxNumOne current ctxInc for coeff_abs_level_greater1 (1st bin of coeff_abs_level_minus1 in AVC)
2883	* \param ui16CtxNumAbs current ctxInc for coeff_abs_level_greater2 (remaining bins of coeff_abs_level_minus1 in AVC)
2884	* \param ui16AbsGoRice Rice parameter for coeff_abs_level_minus3
2885	* \param c1Idx
2886	* \param c2Idx
2887	* \param useLimitedPrefixLength
2888	* \param maxLog2TrDynamicRange
2889	* \returns cost of given absolute transform level
2890	*/
2891	__inline Int TComTrQuant::xGetICRate ( const UInt uiAbsLevel,
2892	const UShort ui16CtxNumOne,
2893	const UShort ui16CtxNumAbs,
2894	const UShort ui16AbsGoRice,
2895	const UInt c1Idx,
2896	const UInt c2Idx,
2897	const Bool useLimitedPrefixLength,
2898	const Int maxLog2TrDynamicRange
2899	) const
2900	{
2901	Int iRate = Int(xGetIEPRate()); // cost of sign bit
2902	UInt baseLevel = (c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx < C2FLAG_NUMBER)) : 1;
2903
2904	if ( uiAbsLevel >= baseLevel )
2905	{
2906	UInt symbol = uiAbsLevel - baseLevel;
2907	UInt length;
2908	if (symbol < (COEF_REMAIN_BIN_REDUCTION << ui16AbsGoRice))
2909	{
2910	length = symbol>>ui16AbsGoRice;
2911	iRate += (length+1+ui16AbsGoRice)<< 15;
2912	}
2913	else if (useLimitedPrefixLength)
2914	{
2915	const UInt maximumPrefixLength = (32 - (COEF_REMAIN_BIN_REDUCTION + maxLog2TrDynamicRange));
2916
2917	UInt prefixLength = 0;
2918	UInt suffix = (symbol >> ui16AbsGoRice) - COEF_REMAIN_BIN_REDUCTION;
2919
2920	while ((prefixLength < maximumPrefixLength) && (suffix > ((2 << prefixLength) - 2)))
2921	{
2922	prefixLength++;
2923	}
2924
2925	const UInt suffixLength = (prefixLength == maximumPrefixLength) ? (maxLog2TrDynamicRange - ui16AbsGoRice) : (prefixLength + 1/separator/);
2926
2927	iRate += (COEF_REMAIN_BIN_REDUCTION + prefixLength + suffixLength + ui16AbsGoRice) << 15;
2928	}
2929	else
2930	{
2931	length = ui16AbsGoRice;
2932	symbol = symbol - ( COEF_REMAIN_BIN_REDUCTION << ui16AbsGoRice);
2933	while (symbol >= (1<<length))
2934	{
2935	symbol -= (1<<(length++));
2936	}
2937	iRate += (COEF_REMAIN_BIN_REDUCTION+length+1-ui16AbsGoRice+length)<< 15;
2938	}
2939
2940	if (c1Idx < C1FLAG_NUMBER)
2941	{
2942	iRate += m_pcEstBitsSbac->m_greaterOneBits[ ui16CtxNumOne ][ 1 ];
2943
2944	if (c2Idx < C2FLAG_NUMBER)
2945	{
2946	iRate += m_pcEstBitsSbac->m_levelAbsBits[ ui16CtxNumAbs ][ 1 ];
2947	}
2948	}
2949	}
2950	else if( uiAbsLevel == 1 )
2951	{
2952	iRate += m_pcEstBitsSbac->m_greaterOneBits[ ui16CtxNumOne ][ 0 ];
2953	}
2954	else if( uiAbsLevel == 2 )
2955	{
2956	iRate += m_pcEstBitsSbac->m_greaterOneBits[ ui16CtxNumOne ][ 1 ];
2957	iRate += m_pcEstBitsSbac->m_levelAbsBits[ ui16CtxNumAbs ][ 0 ];
2958	}
2959	else
2960	{
2961	iRate = 0;
2962	}
2963
2964	return iRate;
2965	}
2966
2967	__inline Double TComTrQuant::xGetRateSigCoeffGroup ( UShort uiSignificanceCoeffGroup,
2968	UShort ui16CtxNumSig ) const
2969	{
2970	return xGetICost( m_pcEstBitsSbac->significantCoeffGroupBits[ ui16CtxNumSig ][ uiSignificanceCoeffGroup ] );
2971	}
2972
2973	/** Calculates the cost of signaling the last significant coefficient in the block
2974	* \param uiPosX X coordinate of the last significant coefficient
2975	* \param uiPosY Y coordinate of the last significant coefficient
2976	* \param component colour component ID
2977	* \returns cost of last significant coefficient
2978	*/
2979	/*
2980	* \param uiWidth width of the transform unit (TU)
2981	*/
2982	__inline Double TComTrQuant::xGetRateLast ( const UInt uiPosX,
2983	const UInt uiPosY,
2984	const ComponentID component ) const
2985	{
2986	UInt uiCtxX = g_uiGroupIdx[uiPosX];
2987	UInt uiCtxY = g_uiGroupIdx[uiPosY];
2988
2989	Double uiCost = m_pcEstBitsSbac->lastXBits[toChannelType(component)][ uiCtxX ] + m_pcEstBitsSbac->lastYBits[toChannelType(component)][ uiCtxY ];
2990
2991	if( uiCtxX > 3 )
2992	{
2993	uiCost += xGetIEPRate() * ((uiCtxX-2)>>1);
2994	}
2995	if( uiCtxY > 3 )
2996	{
2997	uiCost += xGetIEPRate() * ((uiCtxY-2)>>1);
2998	}
2999	return xGetICost( uiCost );
3000	}
3001
3002	__inline Double TComTrQuant::xGetRateSigCoef ( UShort uiSignificance,
3003	UShort ui16CtxNumSig ) const
3004	{
3005	return xGetICost( m_pcEstBitsSbac->significantBits[ ui16CtxNumSig ][ uiSignificance ] );
3006	}
3007
3008	/** Get the cost for a specific rate
3009	* \param dRate rate of a bit
3010	* \returns cost at the specific rate
3011	*/
3012	__inline Double TComTrQuant::xGetICost ( Double dRate ) const
3013	{
3014	return m_dLambda * dRate;
3015	}
3016
3017	/** Get the cost of an equal probable bit
3018	* \returns cost of equal probable bit
3019	*/
3020	__inline Double TComTrQuant::xGetIEPRate ( ) const
3021	{
3022	return 32768;
3023	}
3024
3025	/** Context derivation process of coeff_abs_significant_flag
3026	* \param uiSigCoeffGroupFlag significance map of L1
3027	* \param uiCGPosX column of current scan position
3028	* \param uiCGPosY row of current scan position
3029	* \param widthInGroups width of the block
3030	* \param heightInGroups height of the block
3031	* \returns ctxInc for current scan position
3032	*/
3033	UInt TComTrQuant::getSigCoeffGroupCtxInc (const UInt* uiSigCoeffGroupFlag,
3034	const UInt uiCGPosX,
3035	const UInt uiCGPosY,
3036	const UInt widthInGroups,
3037	const UInt heightInGroups)
3038	{
3039	UInt sigRight = 0;
3040	UInt sigLower = 0;
3041
3042	if (uiCGPosX < (widthInGroups - 1))
3043	{
3044	sigRight = ((uiSigCoeffGroupFlag[ (uiCGPosY * widthInGroups) + uiCGPosX + 1 ] != 0) ? 1 : 0);
3045	}
3046	if (uiCGPosY < (heightInGroups - 1))
3047	{
3048	sigLower = ((uiSigCoeffGroupFlag[ (uiCGPosY + 1) * widthInGroups + uiCGPosX ] != 0) ? 1 : 0);
3049	}
3050
3051	return ((sigRight + sigLower) != 0) ? 1 : 0;
3052	}
3053
3054
3055	/** set quantized matrix coefficient for encode
3056	* \param scalingList quantized matrix address
3057	* \param format chroma format
3058	* \param maxLog2TrDynamicRange
3059	* \param bitDepths reference to bit depth array for all channels
3060	*/
3061	Void TComTrQuant::setScalingList(TComScalingList *scalingList, const Int maxLog2TrDynamicRange[MAX_NUM_CHANNEL_TYPE], const BitDepths &bitDepths)
3062	{
3063	const Int minimumQp = 0;
3064	const Int maximumQp = SCALING_LIST_REM_NUM;
3065
3066	for(UInt size = 0; size < SCALING_LIST_SIZE_NUM; size++)
3067	{
3068	for(UInt list = 0; list < SCALING_LIST_NUM; list++)
3069	{
3070	for(Int qp = minimumQp; qp < maximumQp; qp++)
3071	{
3072	xSetScalingListEnc(scalingList,list,size,qp);
3073	xSetScalingListDec(*scalingList,list,size,qp);
3074	setErrScaleCoeff(list,size,qp,maxLog2TrDynamicRange, bitDepths);
3075	}
3076	}
3077	}
3078	}
3079	/** set quantized matrix coefficient for decode
3080	* \param scalingList quantized matrix address
3081	* \param format chroma format
3082	*/
3083	Void TComTrQuant::setScalingListDec(const TComScalingList &scalingList)
3084	{
3085	const Int minimumQp = 0;
3086	const Int maximumQp = SCALING_LIST_REM_NUM;
3087
3088	for(UInt size = 0; size < SCALING_LIST_SIZE_NUM; size++)
3089	{
3090	for(UInt list = 0; list < SCALING_LIST_NUM; list++)
3091	{
3092	for(Int qp = minimumQp; qp < maximumQp; qp++)
3093	{
3094	xSetScalingListDec(scalingList,list,size,qp);
3095	}
3096	}
3097	}
3098	}
3099	/** set error scale coefficients
3100	* \param list list ID
3101	* \param size
3102	* \param qp quantization parameter
3103	* \param maxLog2TrDynamicRange
3104	* \param bitDepths reference to bit depth array for all channels
3105	*/
3106	Void TComTrQuant::setErrScaleCoeff(UInt list, UInt size, Int qp, const Int maxLog2TrDynamicRange[MAX_NUM_CHANNEL_TYPE], const BitDepths &bitDepths)
3107	{
3108	const UInt uiLog2TrSize = g_aucConvertToBit[ g_scalingListSizeX[size] ] + 2;
3109	const ChannelType channelType = ((list == 0) \|\| (list == MAX_NUM_COMPONENT)) ? CHANNEL_TYPE_LUMA : CHANNEL_TYPE_CHROMA;
3110
3111	const Int channelBitDepth = bitDepths.recon[channelType];
3112	const Int iTransformShift = getTransformShift(channelBitDepth, uiLog2TrSize, maxLog2TrDynamicRange[channelType]); // Represents scaling through forward transform
3113
3114	UInt i,uiMaxNumCoeff = g_scalingListSize[size];
3115	Int *piQuantcoeff;
3116	Double *pdErrScale;
3117	piQuantcoeff = getQuantCoeff(list, qp,size);
3118	pdErrScale = getErrScaleCoeff(list, size, qp);
3119
3120	Double dErrScale = (Double)(1<<SCALE_BITS); // Compensate for scaling of bitcount in Lagrange cost function
3121	dErrScale = dErrScalepow(2.0,(-2.0iTransformShift)); // Compensate for scaling through forward transform
3122
3123	for(i=0;i<uiMaxNumCoeff;i++)
3124	{
3125	pdErrScale[i] = dErrScale / piQuantcoeff[i] / piQuantcoeff[i] / (1 << DISTORTION_PRECISION_ADJUSTMENT(2 * (bitDepths.recon[channelType] - 8)));
3126	}
3127
3128	getErrScaleCoeffNoScalingList(list, size, qp) = dErrScale / g_quantScales[qp] / g_quantScales[qp] / (1 << DISTORTION_PRECISION_ADJUSTMENT(2 * (bitDepths.recon[channelType] - 8)));
3129	}
3130
3131	/** set quantized matrix coefficient for encode
3132	* \param scalingList quantized matrix address
3133	* \param listId List index
3134	* \param sizeId size index
3135	* \param qp Quantization parameter
3136	* \param format chroma format
3137	*/
3138	Void TComTrQuant::xSetScalingListEnc(TComScalingList *scalingList, UInt listId, UInt sizeId, Int qp)
3139	{
3140	UInt width = g_scalingListSizeX[sizeId];
3141	UInt height = g_scalingListSizeX[sizeId];
3142	UInt ratio = g_scalingListSizeX[sizeId]/min(MAX_MATRIX_SIZE_NUM,(Int)g_scalingListSizeX[sizeId]);
3143	Int *quantcoeff;
3144	Int *coeff = scalingList->getScalingListAddress(sizeId,listId);
3145	quantcoeff = getQuantCoeff(listId, qp, sizeId);
3146
3147	Int quantScales = g_quantScales[qp];
3148
3149	processScalingListEnc(coeff,
3150	quantcoeff,
3151	(quantScales << LOG2_SCALING_LIST_NEUTRAL_VALUE),
3152	height, width, ratio,
3153	min(MAX_MATRIX_SIZE_NUM, (Int)g_scalingListSizeX[sizeId]),
3154	scalingList->getScalingListDC(sizeId,listId));
3155	}
3156
3157	/** set quantized matrix coefficient for decode
3158	* \param scalingList quantaized matrix address
3159	* \param listId List index
3160	* \param sizeId size index
3161	* \param qp Quantization parameter
3162	* \param format chroma format
3163	*/
3164	Void TComTrQuant::xSetScalingListDec(const TComScalingList &scalingList, UInt listId, UInt sizeId, Int qp)
3165	{
3166	UInt width = g_scalingListSizeX[sizeId];
3167	UInt height = g_scalingListSizeX[sizeId];
3168	UInt ratio = g_scalingListSizeX[sizeId]/min(MAX_MATRIX_SIZE_NUM,(Int)g_scalingListSizeX[sizeId]);
3169	Int *dequantcoeff;
3170	const Int *coeff = scalingList.getScalingListAddress(sizeId,listId);
3171
3172	dequantcoeff = getDequantCoeff(listId, qp, sizeId);
3173
3174	Int invQuantScale = g_invQuantScales[qp];
3175
3176	processScalingListDec(coeff,
3177	dequantcoeff,
3178	invQuantScale,
3179	height, width, ratio,
3180	min(MAX_MATRIX_SIZE_NUM, (Int)g_scalingListSizeX[sizeId]),
3181	scalingList.getScalingListDC(sizeId,listId));
3182	}
3183
3184	/** set flat matrix value to quantized coefficient
3185	*/
3186	Void TComTrQuant::setFlatScalingList(const Int maxLog2TrDynamicRange[MAX_NUM_CHANNEL_TYPE], const BitDepths &bitDepths)
3187	{
3188	const Int minimumQp = 0;
3189	const Int maximumQp = SCALING_LIST_REM_NUM;
3190
3191	for(UInt size = 0; size < SCALING_LIST_SIZE_NUM; size++)
3192	{
3193	for(UInt list = 0; list < SCALING_LIST_NUM; list++)
3194	{
3195	for(Int qp = minimumQp; qp < maximumQp; qp++)
3196	{
3197	xsetFlatScalingList(list,size,qp);
3198	setErrScaleCoeff(list,size,qp,maxLog2TrDynamicRange, bitDepths);
3199	}
3200	}
3201	}
3202	}
3203
3204	/** set flat matrix value to quantized coefficient
3205	* \param list List ID
3206	* \param size size index
3207	* \param qp Quantization parameter
3208	* \param format chroma format
3209	*/
3210	Void TComTrQuant::xsetFlatScalingList(UInt list, UInt size, Int qp)
3211	{
3212	UInt i,num = g_scalingListSize[size];
3213	Int *quantcoeff;
3214	Int *dequantcoeff;
3215
3216	Int quantScales = g_quantScales [qp];
3217	Int invQuantScales = g_invQuantScales[qp] << 4;
3218
3219	quantcoeff = getQuantCoeff(list, qp, size);
3220	dequantcoeff = getDequantCoeff(list, qp, size);
3221
3222	for(i=0;i<num;i++)
3223	{
3224	*quantcoeff++ = quantScales;
3225	*dequantcoeff++ = invQuantScales;
3226	}
3227	}
3228
3229	/** set quantized matrix coefficient for encode
3230	* \param coeff quantaized matrix address
3231	* \param quantcoeff quantaized matrix address
3232	* \param quantScales Q(QP%6)
3233	* \param height height
3234	* \param width width
3235	* \param ratio ratio for upscale
3236	* \param sizuNum matrix size
3237	* \param dc dc parameter
3238	*/
3239	Void TComTrQuant::processScalingListEnc( Int coeff, Int quantcoeff, Int quantScales, UInt height, UInt width, UInt ratio, Int sizuNum, UInt dc)
3240	{
3241	for(UInt j=0;j<height;j++)
3242	{
3243	for(UInt i=0;i<width;i++)
3244	{
3245	quantcoeff[jwidth + i] = quantScales / coeff[sizuNum (j / ratio) + i / ratio];
3246	}
3247	}
3248
3249	if(ratio > 1)
3250	{
3251	quantcoeff[0] = quantScales / dc;
3252	}
3253	}
3254
3255	/** set quantized matrix coefficient for decode
3256	* \param coeff quantaized matrix address
3257	* \param dequantcoeff quantaized matrix address
3258	* \param invQuantScales IQ(QP%6))
3259	* \param height height
3260	* \param width width
3261	* \param ratio ratio for upscale
3262	* \param sizuNum matrix size
3263	* \param dc dc parameter
3264	*/
3265	Void TComTrQuant::processScalingListDec( const Int coeff, Int dequantcoeff, Int invQuantScales, UInt height, UInt width, UInt ratio, Int sizuNum, UInt dc)
3266	{
3267	for(UInt j=0;j<height;j++)
3268	{
3269	for(UInt i=0;i<width;i++)
3270	{
3271	dequantcoeff[jwidth + i] = invQuantScales coeff[sizuNum * (j / ratio) + i / ratio];
3272	}
3273	}
3274
3275	if(ratio > 1)
3276	{
3277	dequantcoeff[0] = invQuantScales * dc;
3278	}
3279	}
3280
3281	/** initialization process of scaling list array
3282	*/
3283	Void TComTrQuant::initScalingList()
3284	{
3285	for(UInt sizeId = 0; sizeId < SCALING_LIST_SIZE_NUM; sizeId++)
3286	{
3287	for(UInt qp = 0; qp < SCALING_LIST_REM_NUM; qp++)
3288	{
3289	for(UInt listId = 0; listId < SCALING_LIST_NUM; listId++)
3290	{
3291	m_quantCoef [sizeId][listId][qp] = new Int [g_scalingListSize[sizeId]];
3292	m_dequantCoef [sizeId][listId][qp] = new Int [g_scalingListSize[sizeId]];
3293	m_errScale [sizeId][listId][qp] = new Double [g_scalingListSize[sizeId]];
3294	} // listID loop
3295	}
3296	}
3297	}
3298
3299	/** destroy quantization matrix array
3300	*/
3301	Void TComTrQuant::destroyScalingList()
3302	{
3303	for(UInt sizeId = 0; sizeId < SCALING_LIST_SIZE_NUM; sizeId++)
3304	{
3305	for(UInt listId = 0; listId < SCALING_LIST_NUM; listId++)
3306	{
3307	for(UInt qp = 0; qp < SCALING_LIST_REM_NUM; qp++)
3308	{
3309	if(m_quantCoef[sizeId][listId][qp])
3310	{
3311	delete [] m_quantCoef[sizeId][listId][qp];
3312	}
3313	if(m_dequantCoef[sizeId][listId][qp])
3314	{
3315	delete [] m_dequantCoef[sizeId][listId][qp];
3316	}
3317	if(m_errScale[sizeId][listId][qp])
3318	{
3319	delete [] m_errScale[sizeId][listId][qp];
3320	}
3321	}
3322	}
3323	}
3324	}
3325
3326	Void TComTrQuant::transformSkipQuantOneSample(TComTU &rTu, const ComponentID compID, const TCoeff resiDiff, TCoeff* pcCoeff, const UInt uiPos, const QpParam &cQP, const Bool bUseHalfRoundingPoint)
3327	{
3328	TComDataCU *pcCU = rTu.getCU();
3329	const UInt uiAbsPartIdx = rTu.GetAbsPartIdxTU();
3330	const TComRectangle &rect = rTu.getRect(compID);
3331	const UInt uiWidth = rect.width;
3332	const UInt uiHeight = rect.height;
3333	const Int maxLog2TrDynamicRange = pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID));
3334	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
3335	const Int iTransformShift = getTransformShift(channelBitDepth, rTu.GetEquivalentLog2TrSize(compID), maxLog2TrDynamicRange);
3336	const Int scalingListType = getScalingListType(pcCU->getPredictionMode(uiAbsPartIdx), compID);
3337	const Bool enableScalingLists = getUseScalingList(uiWidth, uiHeight, true);
3338	const Int defaultQuantisationCoefficient = g_quantScales[cQP.rem];
3339
3340	assert( scalingListType < SCALING_LIST_NUM );
3341	const Int *const piQuantCoeff = getQuantCoeff( scalingListType, cQP.rem, (rTu.GetEquivalentLog2TrSize(compID)-2) );
3342
3343
3344	/* for 422 chroma blocks, the effective scaling applied during transformation is not a power of 2, hence it cannot be
3345	* implemented as a bit-shift (the quantised result will be sqrt(2) * larger than required). Alternatively, adjust the
3346	* uiLog2TrSize applied in iTransformShift, such that the result is 1/sqrt(2) the required result (i.e. smaller)
3347	* Then a QP+3 (sqrt(2)) or QP-3 (1/sqrt(2)) method could be used to get the required result
3348	*/
3349
3350	const Int iQBits = QUANT_SHIFT + cQP.per + iTransformShift;
3351	// QBits will be OK for any internal bit depth as the reduction in transform shift is balanced by an increase in Qp_per due to QpBDOffset
3352
3353	const Int iAdd = ( bUseHalfRoundingPoint ? 256 : (pcCU->getSlice()->getSliceType() == I_SLICE ? 171 : 85) ) << (iQBits - 9);
3354
3355	TCoeff transformedCoefficient;
3356
3357	// transform-skip
3358	if (iTransformShift >= 0)
3359	{
3360	transformedCoefficient = resiDiff << iTransformShift;
3361	}
3362	else // for very high bit depths
3363	{
3364	const Int iTrShiftNeg = -iTransformShift;
3365	const Int offset = 1 << (iTrShiftNeg - 1);
3366	transformedCoefficient = ( resiDiff + offset ) >> iTrShiftNeg;
3367	}
3368
3369	// quantization
3370	const TCoeff iSign = (transformedCoefficient < 0 ? -1: 1);
3371
3372	const Int quantisationCoefficient = enableScalingLists ? piQuantCoeff[uiPos] : defaultQuantisationCoefficient;
3373
3374	const Int64 tmpLevel = (Int64)abs(transformedCoefficient) * quantisationCoefficient;
3375
3376	const TCoeff quantisedCoefficient = (TCoeff((tmpLevel + iAdd ) >> iQBits)) * iSign;
3377
3378	const TCoeff entropyCodingMinimum = -(1 << maxLog2TrDynamicRange);
3379	const TCoeff entropyCodingMaximum = (1 << maxLog2TrDynamicRange) - 1;
3380	pcCoeff[ uiPos ] = Clip3<TCoeff>( entropyCodingMinimum, entropyCodingMaximum, quantisedCoefficient );
3381	}
3382
3383
3384	Void TComTrQuant::invTrSkipDeQuantOneSample( TComTU &rTu, ComponentID compID, TCoeff inSample, Pel &reconSample, const QpParam &cQP, UInt uiPos )
3385	{
3386	TComDataCU *pcCU = rTu.getCU();
3387	const UInt uiAbsPartIdx = rTu.GetAbsPartIdxTU();
3388	const TComRectangle &rect = rTu.getRect(compID);
3389	const UInt uiWidth = rect.width;
3390	const UInt uiHeight = rect.height;
3391	const Int QP_per = cQP.per;
3392	const Int QP_rem = cQP.rem;
3393	const Int maxLog2TrDynamicRange = pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID));
3394	#if O0043_BEST_EFFORT_DECODING
3395	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getStreamBitDepth(toChannelType(compID));
3396	#else
3397	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
3398	#endif
3399	const Int iTransformShift = getTransformShift(channelBitDepth, rTu.GetEquivalentLog2TrSize(compID), maxLog2TrDynamicRange);
3400	const Int scalingListType = getScalingListType(pcCU->getPredictionMode(uiAbsPartIdx), compID);
3401	const Bool enableScalingLists = getUseScalingList(uiWidth, uiHeight, true);
3402	const UInt uiLog2TrSize = rTu.GetEquivalentLog2TrSize(compID);
3403
3404	assert( scalingListType < SCALING_LIST_NUM );
3405
3406	const Int rightShift = (IQUANT_SHIFT - (iTransformShift + QP_per)) + (enableScalingLists ? LOG2_SCALING_LIST_NEUTRAL_VALUE : 0);
3407
3408	const TCoeff transformMinimum = -(1 << maxLog2TrDynamicRange);
3409	const TCoeff transformMaximum = (1 << maxLog2TrDynamicRange) - 1;
3410
3411	// Dequantisation
3412
3413	TCoeff dequantisedSample;
3414
3415	if(enableScalingLists)
3416	{
3417	const UInt dequantCoefBits = 1 + IQUANT_SHIFT + SCALING_LIST_BITS;
3418	const UInt targetInputBitDepth = std::min<UInt>((maxLog2TrDynamicRange + 1), (((sizeof(Intermediate_Int) * 8) + rightShift) - dequantCoefBits));
3419
3420	const Intermediate_Int inputMinimum = -(1 << (targetInputBitDepth - 1));
3421	const Intermediate_Int inputMaximum = (1 << (targetInputBitDepth - 1)) - 1;
3422
3423	Int *piDequantCoef = getDequantCoeff(scalingListType,QP_rem,uiLog2TrSize-2);
3424
3425	if(rightShift > 0)
3426	{
3427	const Intermediate_Int iAdd = 1 << (rightShift - 1);
3428	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, inSample));
3429	const Intermediate_Int iCoeffQ = ((Intermediate_Int(clipQCoef) * piDequantCoef[uiPos]) + iAdd ) >> rightShift;
3430
3431	dequantisedSample = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
3432	}
3433	else
3434	{
3435	const Int leftShift = -rightShift;
3436	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, inSample));
3437	const Intermediate_Int iCoeffQ = (Intermediate_Int(clipQCoef) * piDequantCoef[uiPos]) << leftShift;
3438
3439	dequantisedSample = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
3440	}
3441	}
3442	else
3443	{
3444	const Int scale = g_invQuantScales[QP_rem];
3445	const Int scaleBits = (IQUANT_SHIFT + 1) ;
3446
3447	const UInt targetInputBitDepth = std::min<UInt>((maxLog2TrDynamicRange + 1), (((sizeof(Intermediate_Int) * 8) + rightShift) - scaleBits));
3448	const Intermediate_Int inputMinimum = -(1 << (targetInputBitDepth - 1));
3449	const Intermediate_Int inputMaximum = (1 << (targetInputBitDepth - 1)) - 1;
3450
3451	if (rightShift > 0)
3452	{
3453	const Intermediate_Int iAdd = 1 << (rightShift - 1);
3454	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, inSample));
3455	const Intermediate_Int iCoeffQ = (Intermediate_Int(clipQCoef) * scale + iAdd) >> rightShift;
3456
3457	dequantisedSample = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
3458	}
3459	else
3460	{
3461	const Int leftShift = -rightShift;
3462	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, inSample));
3463	const Intermediate_Int iCoeffQ = (Intermediate_Int(clipQCoef) * scale) << leftShift;
3464
3465	dequantisedSample = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
3466	}
3467	}
3468
3469	// Inverse transform-skip
3470
3471	if (iTransformShift >= 0)
3472	{
3473	const TCoeff offset = iTransformShift==0 ? 0 : (1 << (iTransformShift - 1));
3474	reconSample = Pel(( dequantisedSample + offset ) >> iTransformShift);
3475	}
3476	else //for very high bit depths
3477	{
3478	const Int iTrShiftNeg = -iTransformShift;
3479	reconSample = Pel(dequantisedSample << iTrShiftNeg);
3480	}
3481	}
3482
3483
3484	Void TComTrQuant::crossComponentPrediction( TComTU & rTu,
3485	const ComponentID compID,
3486	const Pel * piResiL,
3487	const Pel * piResiC,
3488	Pel * piResiT,
3489	const Int width,
3490	const Int height,
3491	const Int strideL,
3492	const Int strideC,
3493	const Int strideT,
3494	const Bool reverse )
3495	{
3496	const Pel *pResiL = piResiL;
3497	const Pel *pResiC = piResiC;
3498	Pel *pResiT = piResiT;
3499
3500	TComDataCU *pCU = rTu.getCU();
3501	const Int alpha = pCU->getCrossComponentPredictionAlpha( rTu.GetAbsPartIdxTU( compID ), compID );
3502	const Int diffBitDepth = pCU->getSlice()->getSPS()->getDifferentialLumaChromaBitDepth();
3503
3504	for( Int y = 0; y < height; y++ )
3505	{
3506	if (reverse)
3507	{
3508	// A constraint is to be added to the HEVC Standard to limit the size of pResiL and pResiC at this point.
3509	// The likely form of the constraint is to either restrict the values to CoeffMin to CoeffMax,
3510	// or to be representable in a bitDepthY+4 or bitDepthC+4 signed integer.
3511	// The result of the constraint is that for 8/10/12bit profiles, the input values
3512	// can be represented within a 16-bit Pel-type.
3513	#if RExt__HIGH_BIT_DEPTH_SUPPORT
3514	for( Int x = 0; x < width; x++ )
3515	{
3516	pResiT[x] = pResiC[x] + (( alpha * rightShift( pResiL[x], diffBitDepth) ) >> 3);
3517	}
3518	#else
3519	const Int minPel=std::numeric_limits<Pel>::min();
3520	const Int maxPel=std::numeric_limits<Pel>::max();
3521	for( Int x = 0; x < width; x++ )
3522	{
3523	pResiT[x] = Clip3<Int>(minPel, maxPel, pResiC[x] + (( alpha * rightShift<Int>(Int(pResiL[x]), diffBitDepth) ) >> 3));
3524	}
3525	#endif
3526	}
3527	else
3528	{
3529	// Forward does not need clipping. Pel type should always be big enough.
3530	for( Int x = 0; x < width; x++ )
3531	{
3532	pResiT[x] = pResiC[x] - (( alpha * rightShift<Int>(Int(pResiL[x]), diffBitDepth) ) >> 3);
3533	}
3534	}
3535
3536	pResiL += strideL;
3537	pResiC += strideC;
3538	pResiT += strideT;
3539	}
3540	}
3541
3542	//! \}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: