Context navigation

source: SHVCSoftware/branches/SHM-dev/source/Lib/TLibCommon/TComTrQuant.cpp @ 1312

Visit:

Last change on this file since 1312 was 1307, checked in by seregin, 9 years ago
port rev 4363
Property svn:eol-style set to `native`
File size: 131.3 KB

Line
1	/* The copyright in this software is being made available under the BSD
2	* License, included below. This software may be subject to other third party
3	* and contributor rights, including patent rights, and no such rights are
4	* granted under this license.
5	*
6	* Copyright (c) 2010-2015, ITU/ISO/IEC
7	* All rights reserved.
8	*
9	* Redistribution and use in source and binary forms, with or without
10	* modification, are permitted provided that the following conditions are met:
11	*
12	* * Redistributions of source code must retain the above copyright notice,
13	* this list of conditions and the following disclaimer.
14	* * Redistributions in binary form must reproduce the above copyright notice,
15	* this list of conditions and the following disclaimer in the documentation
16	* and/or other materials provided with the distribution.
17	* * Neither the name of the ITU/ISO/IEC nor the names of its contributors may
18	* be used to endorse or promote products derived from this software without
19	* specific prior written permission.
20	*
21	* THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS"
22	* AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE
23	* IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE
24	* ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS
25	* BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR
26	* CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF
27	* SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS
28	* INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN
29	* CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE)
30	* ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF
31	* THE POSSIBILITY OF SUCH DAMAGE.
32	*/
33
34	/** \file TComTrQuant.cpp
35	\brief transform and quantization class
36	*/
37
38	#include <stdlib.h>
39	#include <math.h>
40	#include <limits>
41	#include <memory.h>
42	#include "TComTrQuant.h"
43	#include "TComPic.h"
44	#include "ContextTables.h"
45	#include "TComTU.h"
46	#include "Debug.h"
47
48	typedef struct
49	{
50	Int iNNZbeforePos0;
51	Double d64CodedLevelandDist; // distortion and level cost only
52	Double d64UncodedDist; // all zero coded block distortion
53	Double d64SigCost;
54	Double d64SigCost_0;
55	} coeffGroupRDStats;
56
57	//! \ingroup TLibCommon
58	//! \{
59
60	// ====================================================================================================================
61	// Constants
62	// ====================================================================================================================
63
64	#define RDOQ_CHROMA 1 ///< use of RDOQ in chroma
65
66
67	// ====================================================================================================================
68	// QpParam constructor
69	// ====================================================================================================================
70
71	QpParam::QpParam(const Int qpy,
72	const ChannelType chType,
73	const Int qpBdOffset,
74	const Int chromaQPOffset,
75	const ChromaFormat chFmt )
76	{
77	Int baseQp;
78
79	if(isLuma(chType))
80	{
81	baseQp = qpy + qpBdOffset;
82	}
83	else
84	{
85	baseQp = Clip3( -qpBdOffset, (chromaQPMappingTableSize - 1), qpy + chromaQPOffset );
86
87	if(baseQp < 0)
88	{
89	baseQp = baseQp + qpBdOffset;
90	}
91	else
92	{
93	baseQp = getScaledChromaQP(baseQp, chFmt) + qpBdOffset;
94	}
95	}
96
97	Qp =baseQp;
98	per=baseQp/6;
99	rem=baseQp%6;
100	}
101
102	QpParam::QpParam(const TComDataCU &cu, const ComponentID compID)
103	{
104	Int chromaQpOffset = 0;
105
106	if (isChroma(compID))
107	{
108	chromaQpOffset += cu.getSlice()->getPPS()->getQpOffset(compID);
109	chromaQpOffset += cu.getSlice()->getSliceChromaQpDelta(compID);
110
111	chromaQpOffset += cu.getSlice()->getPPS()->getChromaQpAdjTableAt(cu.getChromaQpAdj(0)).u.offset[Int(compID)-1];
112	}
113
114	*this = QpParam(cu.getQP( 0 ),
115	toChannelType(compID),
116	#if SVC_EXTENSION
117	cu.getSlice()->getQpBDOffset(toChannelType(compID)),
118	#else
119	cu.getSlice()->getSPS()->getQpBDOffset(toChannelType(compID)),
120	#endif
121	chromaQpOffset,
122	cu.getPic()->getChromaFormat());
123	}
124
125
126	// ====================================================================================================================
127	// TComTrQuant class member functions
128	// ====================================================================================================================
129
130	TComTrQuant::TComTrQuant()
131	{
132	// allocate temporary buffers
133	m_plTempCoeff = new TCoeff[ MAX_CU_SIZE*MAX_CU_SIZE ];
134
135	// allocate bit estimation class (for RDOQ)
136	m_pcEstBitsSbac = new estBitsSbacStruct;
137	initScalingList();
138	}
139
140	TComTrQuant::~TComTrQuant()
141	{
142	// delete temporary buffers
143	if ( m_plTempCoeff )
144	{
145	delete [] m_plTempCoeff;
146	m_plTempCoeff = NULL;
147	}
148
149	// delete bit estimation class
150	if ( m_pcEstBitsSbac )
151	{
152	delete m_pcEstBitsSbac;
153	}
154	destroyScalingList();
155	}
156
157	#if ADAPTIVE_QP_SELECTION
158	Void TComTrQuant::storeSliceQpNext(TComSlice* pcSlice)
159	{
160	// NOTE: does this work with negative QPs or when some blocks are transquant-bypass enabled?
161
162	Int qpBase = pcSlice->getSliceQpBase();
163	Int sliceQpused = pcSlice->getSliceQp();
164	Int sliceQpnext;
165	Double alpha = qpBase < 17 ? 0.5 : 1;
166
167	Int cnt=0;
168	for(Int u=1; u<=LEVEL_RANGE; u++)
169	{
170	cnt += m_sliceNsamples[u] ;
171	}
172
173	if( !m_useRDOQ )
174	{
175	sliceQpused = qpBase;
176	alpha = 0.5;
177	}
178
179	if( cnt > 120 )
180	{
181	Double sum = 0;
182	Int k = 0;
183	for(Int u=1; u<LEVEL_RANGE; u++)
184	{
185	sum += u*m_sliceSumC[u];
186	k += uum_sliceNsamples[u];
187	}
188
189	Int v;
190	Double q[MAX_QP+1] ;
191	for(v=0; v<=MAX_QP; v++)
192	{
193	q[v] = (Double)(g_invQuantScales[v%6] * (1<<(v/6)))/64 ;
194	}
195
196	Double qnext = sum/k * q[sliceQpused] / (1<<ARL_C_PRECISION);
197
198	for(v=0; v<MAX_QP; v++)
199	{
200	if(qnext < alpha * q[v] + (1 - alpha) * q[v+1] )
201	{
202	break;
203	}
204	}
205	sliceQpnext = Clip3(sliceQpused - 3, sliceQpused + 3, v);
206	}
207	else
208	{
209	sliceQpnext = sliceQpused;
210	}
211
212	m_qpDelta[qpBase] = sliceQpnext - qpBase;
213	}
214
215	Void TComTrQuant::initSliceQpDelta()
216	{
217	for(Int qp=0; qp<=MAX_QP; qp++)
218	{
219	m_qpDelta[qp] = qp < 17 ? 0 : 1;
220	}
221	}
222
223	Void TComTrQuant::clearSliceARLCnt()
224	{
225	memset(m_sliceSumC, 0, sizeof(Double)*(LEVEL_RANGE+1));
226	memset(m_sliceNsamples, 0, sizeof(Int)*(LEVEL_RANGE+1));
227	}
228	#endif
229
230
231
232	#if MATRIX_MULT
233	/** NxN forward transform (2D) using brute force matrix multiplication (3 nested loops)
234	* \param block pointer to input data (residual)
235	* \param coeff pointer to output data (transform coefficients)
236	* \param uiStride stride of input data
237	* \param uiTrSize transform size (uiTrSize x uiTrSize)
238	* \param uiMode is Intra Prediction mode used in Mode-Dependent DCT/DST only
239	*/
240	Void xTr(Int bitDepth, Pel block, TCoeff coeff, UInt uiStride, UInt uiTrSize, Bool useDST, const Int maxLog2TrDynamicRange)
241	{
242	UInt i,j,k;
243	TCoeff iSum;
244	TCoeff tmp[MAX_TU_SIZE * MAX_TU_SIZE];
245	const TMatrixCoeff *iT;
246	UInt uiLog2TrSize = g_aucConvertToBit[ uiTrSize ] + 2;
247
248	if (uiTrSize==4)
249	{
250	iT = (useDST ? g_as_DST_MAT_4[TRANSFORM_FORWARD][0] : g_aiT4[TRANSFORM_FORWARD][0]);
251	}
252	else if (uiTrSize==8)
253	{
254	iT = g_aiT8[TRANSFORM_FORWARD][0];
255	}
256	else if (uiTrSize==16)
257	{
258	iT = g_aiT16[TRANSFORM_FORWARD][0];
259	}
260	else if (uiTrSize==32)
261	{
262	iT = g_aiT32[TRANSFORM_FORWARD][0];
263	}
264	else
265	{
266	assert(0);
267	}
268
269	const Int TRANSFORM_MATRIX_SHIFT = g_transformMatrixShift[TRANSFORM_FORWARD];
270
271	const Int shift_1st = (uiLog2TrSize + bitDepth + TRANSFORM_MATRIX_SHIFT) - maxLog2TrDynamicRange;
272	const Int shift_2nd = uiLog2TrSize + TRANSFORM_MATRIX_SHIFT;
273	const Int add_1st = (shift_1st>0) ? (1<<(shift_1st-1)) : 0;
274	const Int add_2nd = 1<<(shift_2nd-1);
275
276	/* Horizontal transform */
277
278	for (i=0; i<uiTrSize; i++)
279	{
280	for (j=0; j<uiTrSize; j++)
281	{
282	iSum = 0;
283	for (k=0; k<uiTrSize; k++)
284	{
285	iSum += iT[iuiTrSize+k]block[j*uiStride+k];
286	}
287	tmp[i*uiTrSize+j] = (iSum + add_1st)>>shift_1st;
288	}
289	}
290
291	/* Vertical transform */
292	for (i=0; i<uiTrSize; i++)
293	{
294	for (j=0; j<uiTrSize; j++)
295	{
296	iSum = 0;
297	for (k=0; k<uiTrSize; k++)
298	{
299	iSum += iT[iuiTrSize+k]tmp[j*uiTrSize+k];
300	}
301	coeff[i*uiTrSize+j] = (iSum + add_2nd)>>shift_2nd;
302	}
303	}
304	}
305
306	/** NxN inverse transform (2D) using brute force matrix multiplication (3 nested loops)
307	* \param coeff pointer to input data (transform coefficients)
308	* \param block pointer to output data (residual)
309	* \param uiStride stride of output data
310	* \param uiTrSize transform size (uiTrSize x uiTrSize)
311	* \param uiMode is Intra Prediction mode used in Mode-Dependent DCT/DST only
312	*/
313	Void xITr(Int bitDepth, TCoeff coeff, Pel block, UInt uiStride, UInt uiTrSize, Bool useDST, const Int maxLog2TrDynamicRange)
314	{
315	UInt i,j,k;
316	TCoeff iSum;
317	TCoeff tmp[MAX_TU_SIZE * MAX_TU_SIZE];
318	const TMatrixCoeff *iT;
319
320	if (uiTrSize==4)
321	{
322	iT = (useDST ? g_as_DST_MAT_4[TRANSFORM_INVERSE][0] : g_aiT4[TRANSFORM_INVERSE][0]);
323	}
324	else if (uiTrSize==8)
325	{
326	iT = g_aiT8[TRANSFORM_INVERSE][0];
327	}
328	else if (uiTrSize==16)
329	{
330	iT = g_aiT16[TRANSFORM_INVERSE][0];
331	}
332	else if (uiTrSize==32)
333	{
334	iT = g_aiT32[TRANSFORM_INVERSE][0];
335	}
336	else
337	{
338	assert(0);
339	}
340
341	const Int TRANSFORM_MATRIX_SHIFT = g_transformMatrixShift[TRANSFORM_INVERSE];
342
343	const Int shift_1st = TRANSFORM_MATRIX_SHIFT + 1; //1 has been added to shift_1st at the expense of shift_2nd
344	const Int shift_2nd = (TRANSFORM_MATRIX_SHIFT + maxLog2TrDynamicRange - 1) - bitDepth;
345	const TCoeff clipMinimum = -(1 << maxLog2TrDynamicRange);
346	const TCoeff clipMaximum = (1 << maxLog2TrDynamicRange) - 1;
347	assert(shift_2nd>=0);
348	const Int add_1st = 1<<(shift_1st-1);
349	const Int add_2nd = (shift_2nd>0) ? (1<<(shift_2nd-1)) : 0;
350
351	/* Horizontal transform */
352	for (i=0; i<uiTrSize; i++)
353	{
354	for (j=0; j<uiTrSize; j++)
355	{
356	iSum = 0;
357	for (k=0; k<uiTrSize; k++)
358	{
359	iSum += iT[kuiTrSize+i]coeff[k*uiTrSize+j];
360	}
361
362	// Clipping here is not in the standard, but is used to protect the "Pel" data type into which the inverse-transformed samples will be copied
363	tmp[i*uiTrSize+j] = Clip3<TCoeff>(clipMinimum, clipMaximum, (iSum + add_1st)>>shift_1st);
364	}
365	}
366
367	/* Vertical transform */
368	for (i=0; i<uiTrSize; i++)
369	{
370	for (j=0; j<uiTrSize; j++)
371	{
372	iSum = 0;
373	for (k=0; k<uiTrSize; k++)
374	{
375	iSum += iT[kuiTrSize+j]tmp[i*uiTrSize+k];
376	}
377
378	block[i*uiStride+j] = Clip3<TCoeff>(std::numeric_limits<Pel>::min(), std::numeric_limits<Pel>::max(), (iSum + add_2nd)>>shift_2nd);
379	}
380	}
381	}
382
383	#endif //MATRIX_MULT
384
385
386	/** 4x4 forward transform implemented using partial butterfly structure (1D)
387	* \param src input data (residual)
388	* \param dst output data (transform coefficients)
389	* \param shift specifies right shift after 1D transform
390	* \param line
391	*/
392	Void partialButterfly4(TCoeff src, TCoeff dst, Int shift, Int line)
393	{
394	Int j;
395	TCoeff E[2],O[2];
396	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
397
398	for (j=0; j<line; j++)
399	{
400	/* E and O */
401	E[0] = src[0] + src[3];
402	O[0] = src[0] - src[3];
403	E[1] = src[1] + src[2];
404	O[1] = src[1] - src[2];
405
406	dst[0] = (g_aiT4[TRANSFORM_FORWARD][0][0]E[0] + g_aiT4[TRANSFORM_FORWARD][0][1]E[1] + add)>>shift;
407	dst[2line] = (g_aiT4[TRANSFORM_FORWARD][2][0]E[0] + g_aiT4[TRANSFORM_FORWARD][2][1]*E[1] + add)>>shift;
408	dst[line] = (g_aiT4[TRANSFORM_FORWARD][1][0]O[0] + g_aiT4[TRANSFORM_FORWARD][1][1]O[1] + add)>>shift;
409	dst[3line] = (g_aiT4[TRANSFORM_FORWARD][3][0]O[0] + g_aiT4[TRANSFORM_FORWARD][3][1]*O[1] + add)>>shift;
410
411	src += 4;
412	dst ++;
413	}
414	}
415
416	// Fast DST Algorithm. Full matrix multiplication for DST and Fast DST algorithm
417	// give identical results
418	Void fastForwardDst(TCoeff block, TCoeff coeff, Int shift) // input block, output coeff
419	{
420	Int i;
421	TCoeff c[4];
422	TCoeff rnd_factor = (shift > 0) ? (1<<(shift-1)) : 0;
423	for (i=0; i<4; i++)
424	{
425	// Intermediate Variables
426	c[0] = block[4*i+0];
427	c[1] = block[4*i+1];
428	c[2] = block[4*i+2];
429	c[3] = block[4*i+3];
430
431	for (Int row = 0; row < 4; row++)
432	{
433	TCoeff result = 0;
434	for (Int column = 0; column < 4; column++)
435	{
436	result += c[column] * g_as_DST_MAT_4[TRANSFORM_FORWARD][row][column]; // use the defined matrix, rather than hard-wired numbers
437	}
438
439	coeff[(row * 4) + i] = rightShift((result + rnd_factor), shift);
440	}
441	}
442	}
443
444	Void fastInverseDst(TCoeff tmp, TCoeff block, Int shift, const TCoeff outputMinimum, const TCoeff outputMaximum) // input tmp, output block
445	{
446	Int i;
447	TCoeff c[4];
448	TCoeff rnd_factor = (shift > 0) ? (1<<(shift-1)) : 0;
449	for (i=0; i<4; i++)
450	{
451	// Intermediate Variables
452	c[0] = tmp[ i];
453	c[1] = tmp[4 +i];
454	c[2] = tmp[8 +i];
455	c[3] = tmp[12+i];
456
457	for (Int column = 0; column < 4; column++)
458	{
459	TCoeff &result = block[(i * 4) + column];
460
461	result = 0;
462	for (Int row = 0; row < 4; row++)
463	{
464	result += c[row] * g_as_DST_MAT_4[TRANSFORM_INVERSE][row][column]; // use the defined matrix, rather than hard-wired numbers
465	}
466
467	result = Clip3( outputMinimum, outputMaximum, rightShift((result + rnd_factor), shift));
468	}
469	}
470	}
471
472	/** 4x4 inverse transform implemented using partial butterfly structure (1D)
473	* \param src input data (transform coefficients)
474	* \param dst output data (residual)
475	* \param shift specifies right shift after 1D transform
476	* \param line
477	* \param outputMinimum minimum for clipping
478	* \param outputMaximum maximum for clipping
479	*/
480	Void partialButterflyInverse4(TCoeff src, TCoeff dst, Int shift, Int line, const TCoeff outputMinimum, const TCoeff outputMaximum)
481	{
482	Int j;
483	TCoeff E[2],O[2];
484	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
485
486	for (j=0; j<line; j++)
487	{
488	/* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
489	O[0] = g_aiT4[TRANSFORM_INVERSE][1][0]src[line] + g_aiT4[TRANSFORM_INVERSE][3][0]src[3*line];
490	O[1] = g_aiT4[TRANSFORM_INVERSE][1][1]src[line] + g_aiT4[TRANSFORM_INVERSE][3][1]src[3*line];
491	E[0] = g_aiT4[TRANSFORM_INVERSE][0][0]src[0] + g_aiT4[TRANSFORM_INVERSE][2][0]src[2*line];
492	E[1] = g_aiT4[TRANSFORM_INVERSE][0][1]src[0] + g_aiT4[TRANSFORM_INVERSE][2][1]src[2*line];
493
494	/* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
495	dst[0] = Clip3( outputMinimum, outputMaximum, (E[0] + O[0] + add)>>shift );
496	dst[1] = Clip3( outputMinimum, outputMaximum, (E[1] + O[1] + add)>>shift );
497	dst[2] = Clip3( outputMinimum, outputMaximum, (E[1] - O[1] + add)>>shift );
498	dst[3] = Clip3( outputMinimum, outputMaximum, (E[0] - O[0] + add)>>shift );
499
500	src ++;
501	dst += 4;
502	}
503	}
504
505	/** 8x8 forward transform implemented using partial butterfly structure (1D)
506	* \param src input data (residual)
507	* \param dst output data (transform coefficients)
508	* \param shift specifies right shift after 1D transform
509	* \param line
510	*/
511	Void partialButterfly8(TCoeff src, TCoeff dst, Int shift, Int line)
512	{
513	Int j,k;
514	TCoeff E[4],O[4];
515	TCoeff EE[2],EO[2];
516	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
517
518	for (j=0; j<line; j++)
519	{
520	/* E and O*/
521	for (k=0;k<4;k++)
522	{
523	E[k] = src[k] + src[7-k];
524	O[k] = src[k] - src[7-k];
525	}
526	/* EE and EO */
527	EE[0] = E[0] + E[3];
528	EO[0] = E[0] - E[3];
529	EE[1] = E[1] + E[2];
530	EO[1] = E[1] - E[2];
531
532	dst[0] = (g_aiT8[TRANSFORM_FORWARD][0][0]EE[0] + g_aiT8[TRANSFORM_FORWARD][0][1]EE[1] + add)>>shift;
533	dst[4line] = (g_aiT8[TRANSFORM_FORWARD][4][0]EE[0] + g_aiT8[TRANSFORM_FORWARD][4][1]*EE[1] + add)>>shift;
534	dst[2line] = (g_aiT8[TRANSFORM_FORWARD][2][0]EO[0] + g_aiT8[TRANSFORM_FORWARD][2][1]*EO[1] + add)>>shift;
535	dst[6line] = (g_aiT8[TRANSFORM_FORWARD][6][0]EO[0] + g_aiT8[TRANSFORM_FORWARD][6][1]*EO[1] + add)>>shift;
536
537	dst[line] = (g_aiT8[TRANSFORM_FORWARD][1][0]O[0] + g_aiT8[TRANSFORM_FORWARD][1][1]O[1] + g_aiT8[TRANSFORM_FORWARD][1][2]O[2] + g_aiT8[TRANSFORM_FORWARD][1][3]O[3] + add)>>shift;
538	dst[3line] = (g_aiT8[TRANSFORM_FORWARD][3][0]O[0] + g_aiT8[TRANSFORM_FORWARD][3][1]O[1] + g_aiT8[TRANSFORM_FORWARD][3][2]O[2] + g_aiT8[TRANSFORM_FORWARD][3][3]*O[3] + add)>>shift;
539	dst[5line] = (g_aiT8[TRANSFORM_FORWARD][5][0]O[0] + g_aiT8[TRANSFORM_FORWARD][5][1]O[1] + g_aiT8[TRANSFORM_FORWARD][5][2]O[2] + g_aiT8[TRANSFORM_FORWARD][5][3]*O[3] + add)>>shift;
540	dst[7line] = (g_aiT8[TRANSFORM_FORWARD][7][0]O[0] + g_aiT8[TRANSFORM_FORWARD][7][1]O[1] + g_aiT8[TRANSFORM_FORWARD][7][2]O[2] + g_aiT8[TRANSFORM_FORWARD][7][3]*O[3] + add)>>shift;
541
542	src += 8;
543	dst ++;
544	}
545	}
546
547	/** 8x8 inverse transform implemented using partial butterfly structure (1D)
548	* \param src input data (transform coefficients)
549	* \param dst output data (residual)
550	* \param shift specifies right shift after 1D transform
551	* \param line
552	* \param outputMinimum minimum for clipping
553	* \param outputMaximum maximum for clipping
554	*/
555	Void partialButterflyInverse8(TCoeff src, TCoeff dst, Int shift, Int line, const TCoeff outputMinimum, const TCoeff outputMaximum)
556	{
557	Int j,k;
558	TCoeff E[4],O[4];
559	TCoeff EE[2],EO[2];
560	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
561
562	for (j=0; j<line; j++)
563	{
564	/* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
565	for (k=0;k<4;k++)
566	{
567	O[k] = g_aiT8[TRANSFORM_INVERSE][ 1][k]src[line] + g_aiT8[TRANSFORM_INVERSE][ 3][k]src[3*line] +
568	g_aiT8[TRANSFORM_INVERSE][ 5][k]src[5line] + g_aiT8[TRANSFORM_INVERSE][ 7][k]src[7line];
569	}
570
571	EO[0] = g_aiT8[TRANSFORM_INVERSE][2][0]src[ 2line ] + g_aiT8[TRANSFORM_INVERSE][6][0]src[ 6line ];
572	EO[1] = g_aiT8[TRANSFORM_INVERSE][2][1]src[ 2line ] + g_aiT8[TRANSFORM_INVERSE][6][1]src[ 6line ];
573	EE[0] = g_aiT8[TRANSFORM_INVERSE][0][0]src[ 0 ] + g_aiT8[TRANSFORM_INVERSE][4][0]src[ 4*line ];
574	EE[1] = g_aiT8[TRANSFORM_INVERSE][0][1]src[ 0 ] + g_aiT8[TRANSFORM_INVERSE][4][1]src[ 4*line ];
575
576	/* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
577	E[0] = EE[0] + EO[0];
578	E[3] = EE[0] - EO[0];
579	E[1] = EE[1] + EO[1];
580	E[2] = EE[1] - EO[1];
581	for (k=0;k<4;k++)
582	{
583	dst[ k ] = Clip3( outputMinimum, outputMaximum, (E[k] + O[k] + add)>>shift );
584	dst[ k+4 ] = Clip3( outputMinimum, outputMaximum, (E[3-k] - O[3-k] + add)>>shift );
585	}
586	src ++;
587	dst += 8;
588	}
589	}
590
591	/** 16x16 forward transform implemented using partial butterfly structure (1D)
592	* \param src input data (residual)
593	* \param dst output data (transform coefficients)
594	* \param shift specifies right shift after 1D transform
595	* \param line
596	*/
597	Void partialButterfly16(TCoeff src, TCoeff dst, Int shift, Int line)
598	{
599	Int j,k;
600	TCoeff E[8],O[8];
601	TCoeff EE[4],EO[4];
602	TCoeff EEE[2],EEO[2];
603	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
604
605	for (j=0; j<line; j++)
606	{
607	/* E and O*/
608	for (k=0;k<8;k++)
609	{
610	E[k] = src[k] + src[15-k];
611	O[k] = src[k] - src[15-k];
612	}
613	/* EE and EO */
614	for (k=0;k<4;k++)
615	{
616	EE[k] = E[k] + E[7-k];
617	EO[k] = E[k] - E[7-k];
618	}
619	/* EEE and EEO */
620	EEE[0] = EE[0] + EE[3];
621	EEO[0] = EE[0] - EE[3];
622	EEE[1] = EE[1] + EE[2];
623	EEO[1] = EE[1] - EE[2];
624
625	dst[ 0 ] = (g_aiT16[TRANSFORM_FORWARD][ 0][0]EEE[0] + g_aiT16[TRANSFORM_FORWARD][ 0][1]EEE[1] + add)>>shift;
626	dst[ 8line ] = (g_aiT16[TRANSFORM_FORWARD][ 8][0]EEE[0] + g_aiT16[TRANSFORM_FORWARD][ 8][1]*EEE[1] + add)>>shift;
627	dst[ 4line ] = (g_aiT16[TRANSFORM_FORWARD][ 4][0]EEO[0] + g_aiT16[TRANSFORM_FORWARD][ 4][1]*EEO[1] + add)>>shift;
628	dst[ 12line] = (g_aiT16[TRANSFORM_FORWARD][12][0]EEO[0] + g_aiT16[TRANSFORM_FORWARD][12][1]*EEO[1] + add)>>shift;
629
630	for (k=2;k<16;k+=4)
631	{
632	dst[ kline ] = (g_aiT16[TRANSFORM_FORWARD][k][0]EO[0] + g_aiT16[TRANSFORM_FORWARD][k][1]*EO[1] +
633	g_aiT16[TRANSFORM_FORWARD][k][2]EO[2] + g_aiT16[TRANSFORM_FORWARD][k][3]EO[3] + add)>>shift;
634	}
635
636	for (k=1;k<16;k+=2)
637	{
638	dst[ kline ] = (g_aiT16[TRANSFORM_FORWARD][k][0]O[0] + g_aiT16[TRANSFORM_FORWARD][k][1]*O[1] +
639	g_aiT16[TRANSFORM_FORWARD][k][2]O[2] + g_aiT16[TRANSFORM_FORWARD][k][3]O[3] +
640	g_aiT16[TRANSFORM_FORWARD][k][4]O[4] + g_aiT16[TRANSFORM_FORWARD][k][5]O[5] +
641	g_aiT16[TRANSFORM_FORWARD][k][6]O[6] + g_aiT16[TRANSFORM_FORWARD][k][7]O[7] + add)>>shift;
642	}
643
644	src += 16;
645	dst ++;
646
647	}
648	}
649
650	/** 16x16 inverse transform implemented using partial butterfly structure (1D)
651	* \param src input data (transform coefficients)
652	* \param dst output data (residual)
653	* \param shift specifies right shift after 1D transform
654	* \param line
655	* \param outputMinimum minimum for clipping
656	* \param outputMaximum maximum for clipping
657	*/
658	Void partialButterflyInverse16(TCoeff src, TCoeff dst, Int shift, Int line, const TCoeff outputMinimum, const TCoeff outputMaximum)
659	{
660	Int j,k;
661	TCoeff E[8],O[8];
662	TCoeff EE[4],EO[4];
663	TCoeff EEE[2],EEO[2];
664	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
665
666	for (j=0; j<line; j++)
667	{
668	/* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
669	for (k=0;k<8;k++)
670	{
671	O[k] = g_aiT16[TRANSFORM_INVERSE][ 1][k]src[ line] + g_aiT16[TRANSFORM_INVERSE][ 3][k]src[ 3*line] +
672	g_aiT16[TRANSFORM_INVERSE][ 5][k]src[ 5line] + g_aiT16[TRANSFORM_INVERSE][ 7][k]src[ 7line] +
673	g_aiT16[TRANSFORM_INVERSE][ 9][k]src[ 9line] + g_aiT16[TRANSFORM_INVERSE][11][k]src[11line] +
674	g_aiT16[TRANSFORM_INVERSE][13][k]src[13line] + g_aiT16[TRANSFORM_INVERSE][15][k]src[15line];
675	}
676	for (k=0;k<4;k++)
677	{
678	EO[k] = g_aiT16[TRANSFORM_INVERSE][ 2][k]src[ 2line] + g_aiT16[TRANSFORM_INVERSE][ 6][k]src[ 6line] +
679	g_aiT16[TRANSFORM_INVERSE][10][k]src[10line] + g_aiT16[TRANSFORM_INVERSE][14][k]src[14line];
680	}
681	EEO[0] = g_aiT16[TRANSFORM_INVERSE][4][0]src[ 4line ] + g_aiT16[TRANSFORM_INVERSE][12][0]src[ 12line ];
682	EEE[0] = g_aiT16[TRANSFORM_INVERSE][0][0]src[ 0 ] + g_aiT16[TRANSFORM_INVERSE][ 8][0]src[ 8*line ];
683	EEO[1] = g_aiT16[TRANSFORM_INVERSE][4][1]src[ 4line ] + g_aiT16[TRANSFORM_INVERSE][12][1]src[ 12line ];
684	EEE[1] = g_aiT16[TRANSFORM_INVERSE][0][1]src[ 0 ] + g_aiT16[TRANSFORM_INVERSE][ 8][1]src[ 8*line ];
685
686	/* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
687	for (k=0;k<2;k++)
688	{
689	EE[k] = EEE[k] + EEO[k];
690	EE[k+2] = EEE[1-k] - EEO[1-k];
691	}
692	for (k=0;k<4;k++)
693	{
694	E[k] = EE[k] + EO[k];
695	E[k+4] = EE[3-k] - EO[3-k];
696	}
697	for (k=0;k<8;k++)
698	{
699	dst[k] = Clip3( outputMinimum, outputMaximum, (E[k] + O[k] + add)>>shift );
700	dst[k+8] = Clip3( outputMinimum, outputMaximum, (E[7-k] - O[7-k] + add)>>shift );
701	}
702	src ++;
703	dst += 16;
704	}
705	}
706
707	/** 32x32 forward transform implemented using partial butterfly structure (1D)
708	* \param src input data (residual)
709	* \param dst output data (transform coefficients)
710	* \param shift specifies right shift after 1D transform
711	* \param line
712	*/
713	Void partialButterfly32(TCoeff src, TCoeff dst, Int shift, Int line)
714	{
715	Int j,k;
716	TCoeff E[16],O[16];
717	TCoeff EE[8],EO[8];
718	TCoeff EEE[4],EEO[4];
719	TCoeff EEEE[2],EEEO[2];
720	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
721
722	for (j=0; j<line; j++)
723	{
724	/* E and O*/
725	for (k=0;k<16;k++)
726	{
727	E[k] = src[k] + src[31-k];
728	O[k] = src[k] - src[31-k];
729	}
730	/* EE and EO */
731	for (k=0;k<8;k++)
732	{
733	EE[k] = E[k] + E[15-k];
734	EO[k] = E[k] - E[15-k];
735	}
736	/* EEE and EEO */
737	for (k=0;k<4;k++)
738	{
739	EEE[k] = EE[k] + EE[7-k];
740	EEO[k] = EE[k] - EE[7-k];
741	}
742	/* EEEE and EEEO */
743	EEEE[0] = EEE[0] + EEE[3];
744	EEEO[0] = EEE[0] - EEE[3];
745	EEEE[1] = EEE[1] + EEE[2];
746	EEEO[1] = EEE[1] - EEE[2];
747
748	dst[ 0 ] = (g_aiT32[TRANSFORM_FORWARD][ 0][0]EEEE[0] + g_aiT32[TRANSFORM_FORWARD][ 0][1]EEEE[1] + add)>>shift;
749	dst[ 16line ] = (g_aiT32[TRANSFORM_FORWARD][16][0]EEEE[0] + g_aiT32[TRANSFORM_FORWARD][16][1]*EEEE[1] + add)>>shift;
750	dst[ 8line ] = (g_aiT32[TRANSFORM_FORWARD][ 8][0]EEEO[0] + g_aiT32[TRANSFORM_FORWARD][ 8][1]*EEEO[1] + add)>>shift;
751	dst[ 24line ] = (g_aiT32[TRANSFORM_FORWARD][24][0]EEEO[0] + g_aiT32[TRANSFORM_FORWARD][24][1]*EEEO[1] + add)>>shift;
752	for (k=4;k<32;k+=8)
753	{
754	dst[ kline ] = (g_aiT32[TRANSFORM_FORWARD][k][0]EEO[0] + g_aiT32[TRANSFORM_FORWARD][k][1]*EEO[1] +
755	g_aiT32[TRANSFORM_FORWARD][k][2]EEO[2] + g_aiT32[TRANSFORM_FORWARD][k][3]EEO[3] + add)>>shift;
756	}
757	for (k=2;k<32;k+=4)
758	{
759	dst[ kline ] = (g_aiT32[TRANSFORM_FORWARD][k][0]EO[0] + g_aiT32[TRANSFORM_FORWARD][k][1]*EO[1] +
760	g_aiT32[TRANSFORM_FORWARD][k][2]EO[2] + g_aiT32[TRANSFORM_FORWARD][k][3]EO[3] +
761	g_aiT32[TRANSFORM_FORWARD][k][4]EO[4] + g_aiT32[TRANSFORM_FORWARD][k][5]EO[5] +
762	g_aiT32[TRANSFORM_FORWARD][k][6]EO[6] + g_aiT32[TRANSFORM_FORWARD][k][7]EO[7] + add)>>shift;
763	}
764	for (k=1;k<32;k+=2)
765	{
766	dst[ kline ] = (g_aiT32[TRANSFORM_FORWARD][k][ 0]O[ 0] + g_aiT32[TRANSFORM_FORWARD][k][ 1]*O[ 1] +
767	g_aiT32[TRANSFORM_FORWARD][k][ 2]O[ 2] + g_aiT32[TRANSFORM_FORWARD][k][ 3]O[ 3] +
768	g_aiT32[TRANSFORM_FORWARD][k][ 4]O[ 4] + g_aiT32[TRANSFORM_FORWARD][k][ 5]O[ 5] +
769	g_aiT32[TRANSFORM_FORWARD][k][ 6]O[ 6] + g_aiT32[TRANSFORM_FORWARD][k][ 7]O[ 7] +
770	g_aiT32[TRANSFORM_FORWARD][k][ 8]O[ 8] + g_aiT32[TRANSFORM_FORWARD][k][ 9]O[ 9] +
771	g_aiT32[TRANSFORM_FORWARD][k][10]O[10] + g_aiT32[TRANSFORM_FORWARD][k][11]O[11] +
772	g_aiT32[TRANSFORM_FORWARD][k][12]O[12] + g_aiT32[TRANSFORM_FORWARD][k][13]O[13] +
773	g_aiT32[TRANSFORM_FORWARD][k][14]O[14] + g_aiT32[TRANSFORM_FORWARD][k][15]O[15] + add)>>shift;
774	}
775
776	src += 32;
777	dst ++;
778	}
779	}
780
781	/** 32x32 inverse transform implemented using partial butterfly structure (1D)
782	* \param src input data (transform coefficients)
783	* \param dst output data (residual)
784	* \param shift specifies right shift after 1D transform
785	* \param line
786	* \param outputMinimum minimum for clipping
787	* \param outputMaximum maximum for clipping
788	*/
789	Void partialButterflyInverse32(TCoeff src, TCoeff dst, Int shift, Int line, const TCoeff outputMinimum, const TCoeff outputMaximum)
790	{
791	Int j,k;
792	TCoeff E[16],O[16];
793	TCoeff EE[8],EO[8];
794	TCoeff EEE[4],EEO[4];
795	TCoeff EEEE[2],EEEO[2];
796	TCoeff add = (shift > 0) ? (1<<(shift-1)) : 0;
797
798	for (j=0; j<line; j++)
799	{
800	/* Utilizing symmetry properties to the maximum to minimize the number of multiplications */
801	for (k=0;k<16;k++)
802	{
803	O[k] = g_aiT32[TRANSFORM_INVERSE][ 1][k]src[ line ] + g_aiT32[TRANSFORM_INVERSE][ 3][k]src[ 3*line ] +
804	g_aiT32[TRANSFORM_INVERSE][ 5][k]src[ 5line ] + g_aiT32[TRANSFORM_INVERSE][ 7][k]src[ 7line ] +
805	g_aiT32[TRANSFORM_INVERSE][ 9][k]src[ 9line ] + g_aiT32[TRANSFORM_INVERSE][11][k]src[ 11line ] +
806	g_aiT32[TRANSFORM_INVERSE][13][k]src[ 13line ] + g_aiT32[TRANSFORM_INVERSE][15][k]src[ 15line ] +
807	g_aiT32[TRANSFORM_INVERSE][17][k]src[ 17line ] + g_aiT32[TRANSFORM_INVERSE][19][k]src[ 19line ] +
808	g_aiT32[TRANSFORM_INVERSE][21][k]src[ 21line ] + g_aiT32[TRANSFORM_INVERSE][23][k]src[ 23line ] +
809	g_aiT32[TRANSFORM_INVERSE][25][k]src[ 25line ] + g_aiT32[TRANSFORM_INVERSE][27][k]src[ 27line ] +
810	g_aiT32[TRANSFORM_INVERSE][29][k]src[ 29line ] + g_aiT32[TRANSFORM_INVERSE][31][k]src[ 31line ];
811	}
812	for (k=0;k<8;k++)
813	{
814	EO[k] = g_aiT32[TRANSFORM_INVERSE][ 2][k]src[ 2line ] + g_aiT32[TRANSFORM_INVERSE][ 6][k]src[ 6line ] +
815	g_aiT32[TRANSFORM_INVERSE][10][k]src[ 10line ] + g_aiT32[TRANSFORM_INVERSE][14][k]src[ 14line ] +
816	g_aiT32[TRANSFORM_INVERSE][18][k]src[ 18line ] + g_aiT32[TRANSFORM_INVERSE][22][k]src[ 22line ] +
817	g_aiT32[TRANSFORM_INVERSE][26][k]src[ 26line ] + g_aiT32[TRANSFORM_INVERSE][30][k]src[ 30line ];
818	}
819	for (k=0;k<4;k++)
820	{
821	EEO[k] = g_aiT32[TRANSFORM_INVERSE][ 4][k]src[ 4line ] + g_aiT32[TRANSFORM_INVERSE][12][k]src[ 12line ] +
822	g_aiT32[TRANSFORM_INVERSE][20][k]src[ 20line ] + g_aiT32[TRANSFORM_INVERSE][28][k]src[ 28line ];
823	}
824	EEEO[0] = g_aiT32[TRANSFORM_INVERSE][8][0]src[ 8line ] + g_aiT32[TRANSFORM_INVERSE][24][0]src[ 24line ];
825	EEEO[1] = g_aiT32[TRANSFORM_INVERSE][8][1]src[ 8line ] + g_aiT32[TRANSFORM_INVERSE][24][1]src[ 24line ];
826	EEEE[0] = g_aiT32[TRANSFORM_INVERSE][0][0]src[ 0 ] + g_aiT32[TRANSFORM_INVERSE][16][0]src[ 16*line ];
827	EEEE[1] = g_aiT32[TRANSFORM_INVERSE][0][1]src[ 0 ] + g_aiT32[TRANSFORM_INVERSE][16][1]src[ 16*line ];
828
829	/* Combining even and odd terms at each hierarchy levels to calculate the final spatial domain vector */
830	EEE[0] = EEEE[0] + EEEO[0];
831	EEE[3] = EEEE[0] - EEEO[0];
832	EEE[1] = EEEE[1] + EEEO[1];
833	EEE[2] = EEEE[1] - EEEO[1];
834	for (k=0;k<4;k++)
835	{
836	EE[k] = EEE[k] + EEO[k];
837	EE[k+4] = EEE[3-k] - EEO[3-k];
838	}
839	for (k=0;k<8;k++)
840	{
841	E[k] = EE[k] + EO[k];
842	E[k+8] = EE[7-k] - EO[7-k];
843	}
844	for (k=0;k<16;k++)
845	{
846	dst[k] = Clip3( outputMinimum, outputMaximum, (E[k] + O[k] + add)>>shift );
847	dst[k+16] = Clip3( outputMinimum, outputMaximum, (E[15-k] - O[15-k] + add)>>shift );
848	}
849	src ++;
850	dst += 32;
851	}
852	}
853
854	/** MxN forward transform (2D)
855	* \param bitDepth [in] bit depth
856	* \param block [in] residual block
857	* \param coeff [out] transform coefficients
858	* \param iWidth [in] width of transform
859	* \param iHeight [in] height of transform
860	* \param useDST [in]
861	* \param maxLog2TrDynamicRange [in]
862
863	*/
864	Void xTrMxN(Int bitDepth, TCoeff block, TCoeff coeff, Int iWidth, Int iHeight, Bool useDST, const Int maxLog2TrDynamicRange)
865	{
866	const Int TRANSFORM_MATRIX_SHIFT = g_transformMatrixShift[TRANSFORM_FORWARD];
867
868	const Int shift_1st = ((g_aucConvertToBit[iWidth] + 2) + bitDepth + TRANSFORM_MATRIX_SHIFT) - maxLog2TrDynamicRange;
869	const Int shift_2nd = (g_aucConvertToBit[iHeight] + 2) + TRANSFORM_MATRIX_SHIFT;
870
871	assert(shift_1st >= 0);
872	assert(shift_2nd >= 0);
873
874	TCoeff tmp[ MAX_TU_SIZE * MAX_TU_SIZE ];
875
876	switch (iWidth)
877	{
878	case 4:
879	{
880	if ((iHeight == 4) && useDST) // Check for DCT or DST
881	{
882	fastForwardDst( block, tmp, shift_1st );
883	}
884	else
885	{
886	partialButterfly4 ( block, tmp, shift_1st, iHeight );
887	}
888	}
889	break;
890
891	case 8: partialButterfly8 ( block, tmp, shift_1st, iHeight ); break;
892	case 16: partialButterfly16( block, tmp, shift_1st, iHeight ); break;
893	case 32: partialButterfly32( block, tmp, shift_1st, iHeight ); break;
894	default:
895	assert(0); exit (1); break;
896	}
897
898	switch (iHeight)
899	{
900	case 4:
901	{
902	if ((iWidth == 4) && useDST) // Check for DCT or DST
903	{
904	fastForwardDst( tmp, coeff, shift_2nd );
905	}
906	else
907	{
908	partialButterfly4 ( tmp, coeff, shift_2nd, iWidth );
909	}
910	}
911	break;
912
913	case 8: partialButterfly8 ( tmp, coeff, shift_2nd, iWidth ); break;
914	case 16: partialButterfly16( tmp, coeff, shift_2nd, iWidth ); break;
915	case 32: partialButterfly32( tmp, coeff, shift_2nd, iWidth ); break;
916	default:
917	assert(0); exit (1); break;
918	}
919	}
920
921
922	/** MxN inverse transform (2D)
923	* \param bitDepth [in] bit depth
924	* \param coeff [in] transform coefficients
925	* \param block [out] residual block
926	* \param iWidth [in] width of transform
927	* \param iHeight [in] height of transform
928	* \param useDST [in]
929	* \param maxLog2TrDynamicRange [in]
930	*/
931	Void xITrMxN(Int bitDepth, TCoeff coeff, TCoeff block, Int iWidth, Int iHeight, Bool useDST, const Int maxLog2TrDynamicRange)
932	{
933	const Int TRANSFORM_MATRIX_SHIFT = g_transformMatrixShift[TRANSFORM_INVERSE];
934
935	Int shift_1st = TRANSFORM_MATRIX_SHIFT + 1; //1 has been added to shift_1st at the expense of shift_2nd
936	Int shift_2nd = (TRANSFORM_MATRIX_SHIFT + maxLog2TrDynamicRange - 1) - bitDepth;
937	const TCoeff clipMinimum = -(1 << maxLog2TrDynamicRange);
938	const TCoeff clipMaximum = (1 << maxLog2TrDynamicRange) - 1;
939
940	assert(shift_1st >= 0);
941	assert(shift_2nd >= 0);
942
943	TCoeff tmp[MAX_TU_SIZE * MAX_TU_SIZE];
944
945	switch (iHeight)
946	{
947	case 4:
948	{
949	if ((iWidth == 4) && useDST) // Check for DCT or DST
950	{
951	fastInverseDst( coeff, tmp, shift_1st, clipMinimum, clipMaximum);
952	}
953	else
954	{
955	partialButterflyInverse4 ( coeff, tmp, shift_1st, iWidth, clipMinimum, clipMaximum);
956	}
957	}
958	break;
959
960	case 8: partialButterflyInverse8 ( coeff, tmp, shift_1st, iWidth, clipMinimum, clipMaximum); break;
961	case 16: partialButterflyInverse16( coeff, tmp, shift_1st, iWidth, clipMinimum, clipMaximum); break;
962	case 32: partialButterflyInverse32( coeff, tmp, shift_1st, iWidth, clipMinimum, clipMaximum); break;
963
964	default:
965	assert(0); exit (1); break;
966	}
967
968	switch (iWidth)
969	{
970	// Clipping here is not in the standard, but is used to protect the "Pel" data type into which the inverse-transformed samples will be copied
971	case 4:
972	{
973	if ((iHeight == 4) && useDST) // Check for DCT or DST
974	{
975	fastInverseDst( tmp, block, shift_2nd, std::numeric_limits<Pel>::min(), std::numeric_limits<Pel>::max() );
976	}
977	else
978	{
979	partialButterflyInverse4 ( tmp, block, shift_2nd, iHeight, std::numeric_limits<Pel>::min(), std::numeric_limits<Pel>::max());
980	}
981	}
982	break;
983
984	case 8: partialButterflyInverse8 ( tmp, block, shift_2nd, iHeight, std::numeric_limits<Pel>::min(), std::numeric_limits<Pel>::max()); break;
985	case 16: partialButterflyInverse16( tmp, block, shift_2nd, iHeight, std::numeric_limits<Pel>::min(), std::numeric_limits<Pel>::max()); break;
986	case 32: partialButterflyInverse32( tmp, block, shift_2nd, iHeight, std::numeric_limits<Pel>::min(), std::numeric_limits<Pel>::max()); break;
987
988	default:
989	assert(0); exit (1); break;
990	}
991	}
992
993
994	// To minimize the distortion only. No rate is considered.
995	Void TComTrQuant::signBitHidingHDQ( TCoeff* pQCoef, TCoeff* pCoef, TCoeff* deltaU, const TUEntropyCodingParameters &codingParameters, const Int maxLog2TrDynamicRange )
996	{
997	const UInt width = codingParameters.widthInGroups << MLS_CG_LOG2_WIDTH;
998	const UInt height = codingParameters.heightInGroups << MLS_CG_LOG2_HEIGHT;
999	const UInt groupSize = 1 << MLS_CG_SIZE;
1000
1001	const TCoeff entropyCodingMinimum = -(1 << maxLog2TrDynamicRange);
1002	const TCoeff entropyCodingMaximum = (1 << maxLog2TrDynamicRange) - 1;
1003
1004	Int lastCG = -1;
1005	Int absSum = 0 ;
1006	Int n ;
1007
1008	for( Int subSet = (width*height-1) >> MLS_CG_SIZE; subSet >= 0; subSet-- )
1009	{
1010	Int subPos = subSet << MLS_CG_SIZE;
1011	Int firstNZPosInCG=groupSize , lastNZPosInCG=-1 ;
1012	absSum = 0 ;
1013
1014	for(n = groupSize-1; n >= 0; --n )
1015	{
1016	if( pQCoef[ codingParameters.scan[ n + subPos ]] )
1017	{
1018	lastNZPosInCG = n;
1019	break;
1020	}
1021	}
1022
1023	for(n = 0; n <groupSize; n++ )
1024	{
1025	if( pQCoef[ codingParameters.scan[ n + subPos ]] )
1026	{
1027	firstNZPosInCG = n;
1028	break;
1029	}
1030	}
1031
1032	for(n = firstNZPosInCG; n <=lastNZPosInCG; n++ )
1033	{
1034	absSum += Int(pQCoef[ codingParameters.scan[ n + subPos ]]);
1035	}
1036
1037	if(lastNZPosInCG>=0 && lastCG==-1)
1038	{
1039	lastCG = 1 ;
1040	}
1041
1042	if( lastNZPosInCG-firstNZPosInCG>=SBH_THRESHOLD )
1043	{
1044	UInt signbit = (pQCoef[codingParameters.scan[subPos+firstNZPosInCG]]>0?0:1) ;
1045	if( signbit!=(absSum&0x1) ) //compare signbit with sum_parity
1046	{
1047	TCoeff curCost = std::numeric_limits<TCoeff>::max();
1048	TCoeff minCostInc = std::numeric_limits<TCoeff>::max();
1049	Int minPos =-1, finalChange=0, curChange=0;
1050
1051	for( n = (lastCG==1?lastNZPosInCG:groupSize-1) ; n >= 0; --n )
1052	{
1053	UInt blkPos = codingParameters.scan[ n+subPos ];
1054	if(pQCoef[ blkPos ] != 0 )
1055	{
1056	if(deltaU[blkPos]>0)
1057	{
1058	curCost = - deltaU[blkPos];
1059	curChange=1 ;
1060	}
1061	else
1062	{
1063	//curChange =-1;
1064	if(n==firstNZPosInCG && abs(pQCoef[blkPos])==1)
1065	{
1066	curCost = std::numeric_limits<TCoeff>::max();
1067	}
1068	else
1069	{
1070	curCost = deltaU[blkPos];
1071	curChange =-1;
1072	}
1073	}
1074	}
1075	else
1076	{
1077	if(n<firstNZPosInCG)
1078	{
1079	UInt thisSignBit = (pCoef[blkPos]>=0?0:1);
1080	if(thisSignBit != signbit )
1081	{
1082	curCost = std::numeric_limits<TCoeff>::max();
1083	}
1084	else
1085	{
1086	curCost = - (deltaU[blkPos]) ;
1087	curChange = 1 ;
1088	}
1089	}
1090	else
1091	{
1092	curCost = - (deltaU[blkPos]) ;
1093	curChange = 1 ;
1094	}
1095	}
1096
1097	if( curCost<minCostInc)
1098	{
1099	minCostInc = curCost ;
1100	finalChange = curChange ;
1101	minPos = blkPos ;
1102	}
1103	} //CG loop
1104
1105	if(pQCoef[minPos] == entropyCodingMaximum \|\| pQCoef[minPos] == entropyCodingMinimum)
1106	{
1107	finalChange = -1;
1108	}
1109
1110	if(pCoef[minPos]>=0)
1111	{
1112	pQCoef[minPos] += finalChange ;
1113	}
1114	else
1115	{
1116	pQCoef[minPos] -= finalChange ;
1117	}
1118	} // Hide
1119	}
1120	if(lastCG==1)
1121	{
1122	lastCG=0 ;
1123	}
1124	} // TU loop
1125
1126	return;
1127	}
1128
1129
1130	Void TComTrQuant::xQuant( TComTU &rTu,
1131	TCoeff * pSrc,
1132	TCoeff * pDes,
1133	#if ADAPTIVE_QP_SELECTION
1134	TCoeff *pArlDes,
1135	#endif
1136	TCoeff &uiAbsSum,
1137	const ComponentID compID,
1138	const QpParam &cQP )
1139	{
1140	const TComRectangle &rect = rTu.getRect(compID);
1141	const UInt uiWidth = rect.width;
1142	const UInt uiHeight = rect.height;
1143	TComDataCU* pcCU = rTu.getCU();
1144	const UInt uiAbsPartIdx = rTu.GetAbsPartIdxTU();
1145	#if SVC_EXTENSION
1146	const Int channelBitDepth = pcCU->getSlice()->getBitDepth(toChannelType(compID));
1147	#else
1148	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
1149	#endif
1150
1151	TCoeff* piCoef = pSrc;
1152	TCoeff* piQCoef = pDes;
1153	#if ADAPTIVE_QP_SELECTION
1154	TCoeff* piArlCCoef = pArlDes;
1155	#endif
1156
1157	const Bool useTransformSkip = pcCU->getTransformSkip(uiAbsPartIdx, compID);
1158	const Int maxLog2TrDynamicRange = pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID));
1159
1160	Bool useRDOQ = useTransformSkip ? m_useRDOQTS : m_useRDOQ;
1161	if ( useRDOQ && (isLuma(compID) \|\| RDOQ_CHROMA) )
1162	{
1163	#if T0196_SELECTIVE_RDOQ
1164	if ( !m_useSelectiveRDOQ \|\| xNeedRDOQ( rTu, piCoef, compID, cQP ) )
1165	{
1166	#endif
1167	#if ADAPTIVE_QP_SELECTION
1168	xRateDistOptQuant( rTu, piCoef, pDes, pArlDes, uiAbsSum, compID, cQP );
1169	#else
1170	xRateDistOptQuant( rTu, piCoef, pDes, uiAbsSum, compID, cQP );
1171	#endif
1172	#if T0196_SELECTIVE_RDOQ
1173	}
1174	else
1175	{
1176	memset( pDes, 0, sizeof( TCoeff ) * uiWidth *uiHeight );
1177	uiAbsSum = 0;
1178	}
1179	#endif
1180	}
1181	else
1182	{
1183	TUEntropyCodingParameters codingParameters;
1184	getTUEntropyCodingParameters(codingParameters, rTu, compID);
1185
1186	const TCoeff entropyCodingMinimum = -(1 << maxLog2TrDynamicRange);
1187	const TCoeff entropyCodingMaximum = (1 << maxLog2TrDynamicRange) - 1;
1188
1189	TCoeff deltaU[MAX_TU_SIZE * MAX_TU_SIZE];
1190
1191	const UInt uiLog2TrSize = rTu.GetEquivalentLog2TrSize(compID);
1192
1193	Int scalingListType = getScalingListType(pcCU->getPredictionMode(uiAbsPartIdx), compID);
1194	assert(scalingListType < SCALING_LIST_NUM);
1195	Int *piQuantCoeff = getQuantCoeff(scalingListType, cQP.rem, uiLog2TrSize-2);
1196
1197	const Bool enableScalingLists = getUseScalingList(uiWidth, uiHeight, (pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0));
1198	const Int defaultQuantisationCoefficient = g_quantScales[cQP.rem];
1199
1200	/* for 422 chroma blocks, the effective scaling applied during transformation is not a power of 2, hence it cannot be
1201	* implemented as a bit-shift (the quantised result will be sqrt(2) * larger than required). Alternatively, adjust the
1202	* uiLog2TrSize applied in iTransformShift, such that the result is 1/sqrt(2) the required result (i.e. smaller)
1203	* Then a QP+3 (sqrt(2)) or QP-3 (1/sqrt(2)) method could be used to get the required result
1204	*/
1205
1206	// Represents scaling through forward transform
1207	Int iTransformShift = getTransformShift(channelBitDepth, uiLog2TrSize, maxLog2TrDynamicRange);
1208	if (useTransformSkip && pcCU->getSlice()->getSPS()->getUseExtendedPrecision())
1209	{
1210	iTransformShift = std::max<Int>(0, iTransformShift);
1211	}
1212
1213	const Int iQBits = QUANT_SHIFT + cQP.per + iTransformShift;
1214	// QBits will be OK for any internal bit depth as the reduction in transform shift is balanced by an increase in Qp_per due to QpBDOffset
1215
1216	#if ADAPTIVE_QP_SELECTION
1217	Int iQBitsC = MAX_INT;
1218	Int iAddC = MAX_INT;
1219
1220	if (m_bUseAdaptQpSelect)
1221	{
1222	iQBitsC = iQBits - ARL_C_PRECISION;
1223	iAddC = 1 << (iQBitsC-1);
1224	}
1225	#endif
1226
1227	const Int iAdd = (pcCU->getSlice()->getSliceType()==I_SLICE ? 171 : 85) << (iQBits-9);
1228	const Int qBits8 = iQBits - 8;
1229
1230	for( Int uiBlockPos = 0; uiBlockPos < uiWidth*uiHeight; uiBlockPos++ )
1231	{
1232	const TCoeff iLevel = piCoef[uiBlockPos];
1233	const TCoeff iSign = (iLevel < 0 ? -1: 1);
1234
1235	const Int64 tmpLevel = (Int64)abs(iLevel) * (enableScalingLists ? piQuantCoeff[uiBlockPos] : defaultQuantisationCoefficient);
1236
1237	#if ADAPTIVE_QP_SELECTION
1238	if( m_bUseAdaptQpSelect )
1239	{
1240	piArlCCoef[uiBlockPos] = (TCoeff)((tmpLevel + iAddC ) >> iQBitsC);
1241	}
1242	#endif
1243
1244	const TCoeff quantisedMagnitude = TCoeff((tmpLevel + iAdd ) >> iQBits);
1245	deltaU[uiBlockPos] = (TCoeff)((tmpLevel - (quantisedMagnitude<<iQBits) )>> qBits8);
1246
1247	uiAbsSum += quantisedMagnitude;
1248	const TCoeff quantisedCoefficient = quantisedMagnitude * iSign;
1249
1250	piQCoef[uiBlockPos] = Clip3<TCoeff>( entropyCodingMinimum, entropyCodingMaximum, quantisedCoefficient );
1251	} // for n
1252
1253	if( pcCU->getSlice()->getPPS()->getSignHideFlag() )
1254	{
1255	if(uiAbsSum >= 2) //this prevents TUs with only one coefficient of value 1 from being tested
1256	{
1257	signBitHidingHDQ( piQCoef, piCoef, deltaU, codingParameters, maxLog2TrDynamicRange ) ;
1258	}
1259	}
1260	} //if RDOQ
1261	//return;
1262	}
1263
#if T0196_SELECTIVE_RDOQ
/** Pre-check for selective RDOQ.
*  \param rTu    [in] transform unit being coded
*  \param pSrc   [in] transform coefficients
*  \param compID [in] colour component
*  \param cQP    [in] quantisation parameter (per / rem)
*  \return true as soon as one coefficient quantises to a non-zero magnitude with the
*          rounding offset used here, false if every coefficient quantises to zero
*
*  The scaling and shift mirror the plain quantiser in xQuant, including the source
*  of the channel bit depth, so both derive the same iQBits for a given block.
*/
Bool TComTrQuant::xNeedRDOQ( TComTU &rTu, TCoeff * pSrc, const ComponentID compID, const QpParam &cQP )
{
  const TComRectangle &rect = rTu.getRect(compID);
  const UInt uiWidth        = rect.width;
  const UInt uiHeight       = rect.height;
  TComDataCU* pcCU          = rTu.getCU();
  const UInt uiAbsPartIdx   = rTu.GetAbsPartIdxTU();
  // same bit-depth lookup as xQuant, so the transform shift matches the quantiser being gated
#if SVC_EXTENSION
  const Int channelBitDepth = pcCU->getSlice()->getBitDepth(toChannelType(compID));
#else
  const Int channelBitDepth = pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
#endif

  TCoeff* piCoef    = pSrc;

  const Bool useTransformSkip      = pcCU->getTransformSkip(uiAbsPartIdx, compID);
  const Int  maxLog2TrDynamicRange = pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID));

  const UInt uiLog2TrSize = rTu.GetEquivalentLog2TrSize(compID);

  Int scalingListType = getScalingListType(pcCU->getPredictionMode(uiAbsPartIdx), compID);
  assert(scalingListType < SCALING_LIST_NUM);
  Int *piQuantCoeff = getQuantCoeff(scalingListType, cQP.rem, uiLog2TrSize-2);

  const Bool enableScalingLists             = getUseScalingList(uiWidth, uiHeight, (pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0));
  const Int  defaultQuantisationCoefficient = g_quantScales[cQP.rem];

  /* for 422 chroma blocks, the effective scaling applied during transformation is not a power of 2, hence it cannot be
   * implemented as a bit-shift (the quantised result will be sqrt(2) * larger than required). Alternatively, adjust the
   * uiLog2TrSize applied in iTransformShift, such that the result is 1/sqrt(2) the required result (i.e. smaller)
   * Then a QP+3 (sqrt(2)) or QP-3 (1/sqrt(2)) method could be used to get the required result
   */

  // Represents scaling through forward transform
  Int iTransformShift = getTransformShift(channelBitDepth, uiLog2TrSize, maxLog2TrDynamicRange);
  if (useTransformSkip && pcCU->getSlice()->getSPS()->getUseExtendedPrecision())
  {
    iTransformShift = std::max<Int>(0, iTransformShift);
  }

  const Int iQBits = QUANT_SHIFT + cQP.per + iTransformShift;
  // QBits will be OK for any internal bit depth as the reduction in transform shift is balanced by an increase in Qp_per due to QpBDOffset

  // iAdd is different from the iAdd used in normal quantization
  const Int iAdd   = (compID == COMPONENT_Y ? 171 : 256) << (iQBits-9);

  for( Int uiBlockPos = 0; uiBlockPos < uiWidth*uiHeight; uiBlockPos++ )
  {
    const TCoeff iLevel   = piCoef[uiBlockPos];
    const Int64  tmpLevel = (Int64)abs(iLevel) * (enableScalingLists ? piQuantCoeff[uiBlockPos] : defaultQuantisationCoefficient);
    const TCoeff quantisedMagnitude = TCoeff((tmpLevel + iAdd ) >> iQBits);

    if ( quantisedMagnitude != 0 )
    {
      return true;
    }
  } // for n
  return false;
}
#endif
1321
1322	Void TComTrQuant::xDeQuant( TComTU &rTu,
1323	const TCoeff * pSrc,
1324	TCoeff * pDes,
1325	const ComponentID compID,
1326	const QpParam &cQP )
1327	{
1328	assert(compID<MAX_NUM_COMPONENT);
1329
1330	TComDataCU *pcCU = rTu.getCU();
1331	const UInt uiAbsPartIdx = rTu.GetAbsPartIdxTU();
1332	const TComRectangle &rect = rTu.getRect(compID);
1333	const UInt uiWidth = rect.width;
1334	const UInt uiHeight = rect.height;
1335	const TCoeff *const piQCoef = pSrc;
1336	TCoeff *const piCoef = pDes;
1337	const UInt uiLog2TrSize = rTu.GetEquivalentLog2TrSize(compID);
1338	const UInt numSamplesInBlock = uiWidth*uiHeight;
1339	const Int maxLog2TrDynamicRange = pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID));
1340	const TCoeff transformMinimum = -(1 << maxLog2TrDynamicRange);
1341	const TCoeff transformMaximum = (1 << maxLog2TrDynamicRange) - 1;
1342	const Bool enableScalingLists = getUseScalingList(uiWidth, uiHeight, (pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0));
1343	const Int scalingListType = getScalingListType(pcCU->getPredictionMode(uiAbsPartIdx), compID);
1344	#if O0043_BEST_EFFORT_DECODING
1345	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getStreamBitDepth(toChannelType(compID));
1346	#else
1347	#if SVC_EXTENSION
1348	const Int channelBitDepth = pcCU->getSlice()->getBitDepth(toChannelType(compID));
1349	#else
1350	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
1351	#endif
1352	#endif
1353
1354	assert (scalingListType < SCALING_LIST_NUM);
1355	assert ( uiWidth <= m_uiMaxTrSize );
1356
1357	// Represents scaling through forward transform
1358	const Bool bClipTransformShiftTo0 = (pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0) && pcCU->getSlice()->getSPS()->getUseExtendedPrecision();
1359	const Int originalTransformShift = getTransformShift(channelBitDepth, uiLog2TrSize, maxLog2TrDynamicRange);
1360	const Int iTransformShift = bClipTransformShiftTo0 ? std::max<Int>(0, originalTransformShift) : originalTransformShift;
1361
1362	const Int QP_per = cQP.per;
1363	const Int QP_rem = cQP.rem;
1364
1365	const Int rightShift = (IQUANT_SHIFT - (iTransformShift + QP_per)) + (enableScalingLists ? LOG2_SCALING_LIST_NEUTRAL_VALUE : 0);
1366
1367	if(enableScalingLists)
1368	{
1369	//from the dequantisation equation:
1370	//iCoeffQ = ((Intermediate_Int(clipQCoef) * piDequantCoef[deQuantIdx]) + iAdd ) >> rightShift
1371	//(sizeof(Intermediate_Int) * 8) = inputBitDepth + dequantCoefBits - rightShift
1372	const UInt dequantCoefBits = 1 + IQUANT_SHIFT + SCALING_LIST_BITS;
1373	const UInt targetInputBitDepth = std::min<UInt>((maxLog2TrDynamicRange + 1), (((sizeof(Intermediate_Int) * 8) + rightShift) - dequantCoefBits));
1374
1375	const Intermediate_Int inputMinimum = -(1 << (targetInputBitDepth - 1));
1376	const Intermediate_Int inputMaximum = (1 << (targetInputBitDepth - 1)) - 1;
1377
1378	Int *piDequantCoef = getDequantCoeff(scalingListType,QP_rem,uiLog2TrSize-2);
1379
1380	if(rightShift > 0)
1381	{
1382	const Intermediate_Int iAdd = 1 << (rightShift - 1);
1383
1384	for( Int n = 0; n < numSamplesInBlock; n++ )
1385	{
1386	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, piQCoef[n]));
1387	const Intermediate_Int iCoeffQ = ((Intermediate_Int(clipQCoef) * piDequantCoef[n]) + iAdd ) >> rightShift;
1388
1389	piCoef[n] = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
1390	}
1391	}
1392	else
1393	{
1394	const Int leftShift = -rightShift;
1395
1396	for( Int n = 0; n < numSamplesInBlock; n++ )
1397	{
1398	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, piQCoef[n]));
1399	const Intermediate_Int iCoeffQ = (Intermediate_Int(clipQCoef) * piDequantCoef[n]) << leftShift;
1400
1401	piCoef[n] = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
1402	}
1403	}
1404	}
1405	else
1406	{
1407	const Int scale = g_invQuantScales[QP_rem];
1408	const Int scaleBits = (IQUANT_SHIFT + 1) ;
1409
1410	//from the dequantisation equation:
1411	//iCoeffQ = Intermediate_Int((Int64(clipQCoef) * scale + iAdd) >> rightShift);
1412	//(sizeof(Intermediate_Int) * 8) = inputBitDepth + scaleBits - rightShift
1413	const UInt targetInputBitDepth = std::min<UInt>((maxLog2TrDynamicRange + 1), (((sizeof(Intermediate_Int) * 8) + rightShift) - scaleBits));
1414	const Intermediate_Int inputMinimum = -(1 << (targetInputBitDepth - 1));
1415	const Intermediate_Int inputMaximum = (1 << (targetInputBitDepth - 1)) - 1;
1416
1417	if (rightShift > 0)
1418	{
1419	const Intermediate_Int iAdd = 1 << (rightShift - 1);
1420
1421	for( Int n = 0; n < numSamplesInBlock; n++ )
1422	{
1423	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, piQCoef[n]));
1424	const Intermediate_Int iCoeffQ = (Intermediate_Int(clipQCoef) * scale + iAdd) >> rightShift;
1425
1426	piCoef[n] = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
1427	}
1428	}
1429	else
1430	{
1431	const Int leftShift = -rightShift;
1432
1433	for( Int n = 0; n < numSamplesInBlock; n++ )
1434	{
1435	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, piQCoef[n]));
1436	const Intermediate_Int iCoeffQ = (Intermediate_Int(clipQCoef) * scale) << leftShift;
1437
1438	piCoef[n] = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
1439	}
1440	}
1441	}
1442	}
1443
1444
1445	Void TComTrQuant::init( UInt uiMaxTrSize,
1446	Bool bUseRDOQ,
1447	Bool bUseRDOQTS,
1448	#if T0196_SELECTIVE_RDOQ
1449	Bool useSelectiveRDOQ,
1450	#endif
1451	Bool bEnc,
1452	Bool useTransformSkipFast
1453	#if ADAPTIVE_QP_SELECTION
1454	, Bool bUseAdaptQpSelect
1455	#endif
1456	)
1457	{
1458	m_uiMaxTrSize = uiMaxTrSize;
1459	m_bEnc = bEnc;
1460	m_useRDOQ = bUseRDOQ;
1461	m_useRDOQTS = bUseRDOQTS;
1462	#if T0196_SELECTIVE_RDOQ
1463	m_useSelectiveRDOQ = useSelectiveRDOQ;
1464	#endif
1465	#if ADAPTIVE_QP_SELECTION
1466	m_bUseAdaptQpSelect = bUseAdaptQpSelect;
1467	#endif
1468	m_useTransformSkipFast = useTransformSkipFast;
1469	}
1470
1471
1472	Void TComTrQuant::transformNxN( TComTU & rTu,
1473	const ComponentID compID,
1474	Pel * pcResidual,
1475	const UInt uiStride,
1476	TCoeff * rpcCoeff,
1477	#if ADAPTIVE_QP_SELECTION
1478	TCoeff * pcArlCoeff,
1479	#endif
1480	TCoeff & uiAbsSum,
1481	const QpParam & cQP
1482	)
1483	{
1484	const TComRectangle &rect = rTu.getRect(compID);
1485	const UInt uiWidth = rect.width;
1486	const UInt uiHeight = rect.height;
1487	TComDataCU* pcCU = rTu.getCU();
1488	const UInt uiAbsPartIdx = rTu.GetAbsPartIdxTU();
1489	const UInt uiOrgTrDepth = rTu.GetTransformDepthRel();
1490
1491	uiAbsSum=0;
1492
1493	RDPCMMode rdpcmMode = RDPCM_OFF;
1494	rdpcmNxN( rTu, compID, pcResidual, uiStride, cQP, rpcCoeff, uiAbsSum, rdpcmMode );
1495
1496	if (rdpcmMode == RDPCM_OFF)
1497	{
1498	uiAbsSum = 0;
1499	//transform and quantise
1500	if(pcCU->getCUTransquantBypass(uiAbsPartIdx))
1501	{
1502	const Bool rotateResidual = rTu.isNonTransformedResidualRotated(compID);
1503	const UInt uiSizeMinus1 = (uiWidth * uiHeight) - 1;
1504
1505	for (UInt y = 0, coefficientIndex = 0; y<uiHeight; y++)
1506	{
1507	for (UInt x = 0; x<uiWidth; x++, coefficientIndex++)
1508	{
1509	const Pel currentSample = pcResidual[(y * uiStride) + x];
1510
1511	rpcCoeff[rotateResidual ? (uiSizeMinus1 - coefficientIndex) : coefficientIndex] = currentSample;
1512	uiAbsSum += TCoeff(abs(currentSample));
1513	}
1514	}
1515	}
1516	else
1517	{
1518	#ifdef DEBUG_TRANSFORM_AND_QUANTISE
1519	std::cout << g_debugCounter << ": " << uiWidth << "x" << uiHeight << " channel " << compID << " TU at input to transform\n";
1520	printBlock(pcResidual, uiWidth, uiHeight, uiStride);
1521	#endif
1522
1523	assert( (pcCU->getSlice()->getSPS()->getMaxTrSize() >= uiWidth) );
1524
1525	if(pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0)
1526	{
1527	xTransformSkip( pcResidual, uiStride, m_plTempCoeff, rTu, compID );
1528	}
1529	else
1530	{
1531	#if SVC_EXTENSION
1532	const Int channelBitDepth=pcCU->getSlice()->getBitDepth(toChannelType(compID));
1533	#else
1534	const Int channelBitDepth=pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
1535	#endif
1536	xT( channelBitDepth, rTu.useDST(compID), pcResidual, uiStride, m_plTempCoeff, uiWidth, uiHeight, pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID)) );
1537	}
1538
1539	#ifdef DEBUG_TRANSFORM_AND_QUANTISE
1540	std::cout << g_debugCounter << ": " << uiWidth << "x" << uiHeight << " channel " << compID << " TU between transform and quantiser\n";
1541	printBlock(m_plTempCoeff, uiWidth, uiHeight, uiWidth);
1542	#endif
1543
1544	xQuant( rTu, m_plTempCoeff, rpcCoeff,
1545
1546	#if ADAPTIVE_QP_SELECTION
1547	pcArlCoeff,
1548	#endif
1549	uiAbsSum, compID, cQP );
1550
1551	#ifdef DEBUG_TRANSFORM_AND_QUANTISE
1552	std::cout << g_debugCounter << ": " << uiWidth << "x" << uiHeight << " channel " << compID << " TU at output of quantiser\n";
1553	printBlock(rpcCoeff, uiWidth, uiHeight, uiWidth);
1554	#endif
1555	}
1556	}
1557
1558	//set the CBF
1559	pcCU->setCbfPartRange((((uiAbsSum > 0) ? 1 : 0) << uiOrgTrDepth), compID, uiAbsPartIdx, rTu.GetAbsPartIdxNumParts(compID));
1560	}
1561
1562
1563	Void TComTrQuant::invTransformNxN( TComTU &rTu,
1564	const ComponentID compID,
1565	Pel *pcResidual,
1566	const UInt uiStride,
1567	TCoeff * pcCoeff,
1568	const QpParam &cQP
1569	DEBUG_STRING_FN_DECLAREP(psDebug))
1570	{
1571	TComDataCU* pcCU=rTu.getCU();
1572	const UInt uiAbsPartIdx = rTu.GetAbsPartIdxTU();
1573	const TComRectangle &rect = rTu.getRect(compID);
1574	const UInt uiWidth = rect.width;
1575	const UInt uiHeight = rect.height;
1576
1577	if (uiWidth != uiHeight) //for intra, the TU will have been split above this level, so this condition won't be true, hence this only affects inter
1578	{
1579	//------------------------------------------------
1580
1581	//recurse deeper
1582
1583	TComTURecurse subTURecurse(rTu, false, TComTU::VERTICAL_SPLIT, true, compID);
1584
1585	do
1586	{
1587	//------------------
1588
1589	const UInt lineOffset = subTURecurse.GetSectionNumber() * subTURecurse.getRect(compID).height;
1590
1591	Pel subTUResidual = pcResidual + (lineOffset uiStride);
1592	TCoeff subTUCoefficients = pcCoeff + (lineOffset subTURecurse.getRect(compID).width);
1593
1594	invTransformNxN(subTURecurse, compID, subTUResidual, uiStride, subTUCoefficients, cQP DEBUG_STRING_PASS_INTO(psDebug));
1595
1596	//------------------
1597
1598	} while (subTURecurse.nextSection(rTu));
1599
1600	//------------------------------------------------
1601
1602	return;
1603	}
1604
1605	#if defined DEBUG_STRING
1606	if (psDebug)
1607	{
1608	std::stringstream ss(stringstream::out);
1609	printBlockToStream(ss, (compID==0)?"###InvTran ip Ch0: " : ((compID==1)?"###InvTran ip Ch1: ":"###InvTran ip Ch2: "), pcCoeff, uiWidth, uiHeight, uiWidth);
1610	DEBUG_STRING_APPEND((*psDebug), ss.str())
1611	}
1612	#endif
1613
1614	if(pcCU->getCUTransquantBypass(uiAbsPartIdx))
1615	{
1616	const Bool rotateResidual = rTu.isNonTransformedResidualRotated(compID);
1617	const UInt uiSizeMinus1 = (uiWidth * uiHeight) - 1;
1618
1619	for (UInt y = 0, coefficientIndex = 0; y<uiHeight; y++)
1620	{
1621	for (UInt x = 0; x<uiWidth; x++, coefficientIndex++)
1622	{
1623	pcResidual[(y * uiStride) + x] = Pel(pcCoeff[rotateResidual ? (uiSizeMinus1 - coefficientIndex) : coefficientIndex]);
1624	}
1625	}
1626	}
1627	else
1628	{
1629	#ifdef DEBUG_TRANSFORM_AND_QUANTISE
1630	std::cout << g_debugCounter << ": " << uiWidth << "x" << uiHeight << " channel " << compID << " TU at input to dequantiser\n";
1631	printBlock(pcCoeff, uiWidth, uiHeight, uiWidth);
1632	#endif
1633
1634	xDeQuant(rTu, pcCoeff, m_plTempCoeff, compID, cQP);
1635
1636	#ifdef DEBUG_TRANSFORM_AND_QUANTISE
1637	std::cout << g_debugCounter << ": " << uiWidth << "x" << uiHeight << " channel " << compID << " TU between dequantiser and inverse-transform\n";
1638	printBlock(m_plTempCoeff, uiWidth, uiHeight, uiWidth);
1639	#endif
1640
1641	#if defined DEBUG_STRING
1642	if (psDebug)
1643	{
1644	std::stringstream ss(stringstream::out);
1645	printBlockToStream(ss, "###InvTran deq: ", m_plTempCoeff, uiWidth, uiHeight, uiWidth);
1646	(*psDebug)+=ss.str();
1647	}
1648	#endif
1649
1650	if(pcCU->getTransformSkip(uiAbsPartIdx, compID))
1651	{
1652	xITransformSkip( m_plTempCoeff, pcResidual, uiStride, rTu, compID );
1653
1654	#if defined DEBUG_STRING
1655	if (psDebug)
1656	{
1657	std::stringstream ss(stringstream::out);
1658	printBlockToStream(ss, "###InvTran resi: ", pcResidual, uiWidth, uiHeight, uiStride);
1659	(*psDebug)+=ss.str();
1660	(*psDebug)+="(<- was a Transform-skipped block)\n";
1661	}
1662	#endif
1663	}
1664	else
1665	{
1666	#if O0043_BEST_EFFORT_DECODING
1667	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getStreamBitDepth(toChannelType(compID));
1668	#else
1669	#if SVC_EXTENSION
1670	const Int channelBitDepth = pcCU->getSlice()->getBitDepth(toChannelType(compID));
1671	#else
1672	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
1673	#endif
1674	#endif
1675	xIT( channelBitDepth, rTu.useDST(compID), m_plTempCoeff, pcResidual, uiStride, uiWidth, uiHeight, pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID)) );
1676
1677	#if defined DEBUG_STRING
1678	if (psDebug)
1679	{
1680	std::stringstream ss(stringstream::out);
1681	printBlockToStream(ss, "###InvTran resi: ", pcResidual, uiWidth, uiHeight, uiStride);
1682	(*psDebug)+=ss.str();
1683	(*psDebug)+="(<- was a Transformed block)\n";
1684	}
1685	#endif
1686	}
1687
1688	#ifdef DEBUG_TRANSFORM_AND_QUANTISE
1689	std::cout << g_debugCounter << ": " << uiWidth << "x" << uiHeight << " channel " << compID << " TU at output of inverse-transform\n";
1690	printBlock(pcResidual, uiWidth, uiHeight, uiStride);
1691	g_debugCounter++;
1692	#endif
1693	}
1694
1695	invRdpcmNxN( rTu, compID, pcResidual, uiStride );
1696	}
1697
1698	Void TComTrQuant::invRecurTransformNxN( const ComponentID compID,
1699	TComYuv *pResidual,
1700	TComTU &rTu)
1701	{
1702	if (!rTu.ProcessComponentSection(compID))
1703	{
1704	return;
1705	}
1706
1707	TComDataCU* pcCU = rTu.getCU();
1708	UInt absPartIdxTU = rTu.GetAbsPartIdxTU();
1709	UInt uiTrMode=rTu.GetTransformDepthRel();
1710	if( (pcCU->getCbf(absPartIdxTU, compID, uiTrMode) == 0) && (isLuma(compID) \|\| !pcCU->getSlice()->getPPS()->getUseCrossComponentPrediction()) )
1711	{
1712	return;
1713	}
1714
1715	if( uiTrMode == pcCU->getTransformIdx( absPartIdxTU ) )
1716	{
1717	const TComRectangle &tuRect = rTu.getRect(compID);
1718	const Int uiStride = pResidual->getStride( compID );
1719	Pel *rpcResidual = pResidual->getAddr( compID );
1720	UInt uiAddr = (tuRect.x0 + uiStride*tuRect.y0);
1721	Pel *pResi = rpcResidual + uiAddr;
1722	TCoeff *pcCoeff = pcCU->getCoeff(compID) + rTu.getCoefficientOffset(compID);
1723
1724	const QpParam cQP(*pcCU, compID);
1725
1726	if(pcCU->getCbf(absPartIdxTU, compID, uiTrMode) != 0)
1727	{
1728	DEBUG_STRING_NEW(sTemp)
1729	#ifdef DEBUG_STRING
1730	std::string *psDebug=((DebugOptionList::DebugString_InvTran.getInt()&(pcCU->isIntra(absPartIdxTU)?1:(pcCU->isInter(absPartIdxTU)?2:4)))!=0) ? &sTemp : 0;
1731	#endif
1732
1733	invTransformNxN( rTu, compID, pResi, uiStride, pcCoeff, cQP DEBUG_STRING_PASS_INTO(psDebug) );
1734
1735	#ifdef DEBUG_STRING
1736	if (psDebug != 0)
1737	{
1738	std::cout << (*psDebug);
1739	}
1740	#endif
1741	}
1742
1743	if (isChroma(compID) && (pcCU->getCrossComponentPredictionAlpha(absPartIdxTU, compID) != 0))
1744	{
1745	const Pel *piResiLuma = pResidual->getAddr( COMPONENT_Y );
1746	const Int strideLuma = pResidual->getStride( COMPONENT_Y );
1747	const Int tuWidth = rTu.getRect( compID ).width;
1748	const Int tuHeight = rTu.getRect( compID ).height;
1749
1750	if(pcCU->getCbf(absPartIdxTU, COMPONENT_Y, uiTrMode) != 0)
1751	{
1752	pResi = rpcResidual + uiAddr;
1753	const Pel *pResiLuma = piResiLuma + uiAddr;
1754
1755	crossComponentPrediction( rTu, compID, pResiLuma, pResi, pResi, tuWidth, tuHeight, strideLuma, uiStride, uiStride, true );
1756	}
1757	}
1758	}
1759	else
1760	{
1761	TComTURecurse tuRecurseChild(rTu, false);
1762	do
1763	{
1764	invRecurTransformNxN( compID, pResidual, tuRecurseChild );
1765	} while (tuRecurseChild.nextSection(rTu));
1766	}
1767	}
1768
1769	Void TComTrQuant::applyForwardRDPCM( TComTU& rTu, const ComponentID compID, Pel* pcResidual, const UInt uiStride, const QpParam& cQP, TCoeff* pcCoeff, TCoeff &uiAbsSum, const RDPCMMode mode )
1770	{
1771	TComDataCU *pcCU=rTu.getCU();
1772	const UInt uiAbsPartIdx=rTu.GetAbsPartIdxTU();
1773
1774	const Bool bLossless = pcCU->getCUTransquantBypass( uiAbsPartIdx );
1775	const UInt uiWidth = rTu.getRect(compID).width;
1776	const UInt uiHeight = rTu.getRect(compID).height;
1777	const Bool rotateResidual = rTu.isNonTransformedResidualRotated(compID);
1778	const UInt uiSizeMinus1 = (uiWidth * uiHeight) - 1;
1779
1780	UInt uiX = 0;
1781	UInt uiY = 0;
1782
1783	UInt &majorAxis = (mode == RDPCM_VER) ? uiX : uiY;
1784	UInt &minorAxis = (mode == RDPCM_VER) ? uiY : uiX;
1785	const UInt majorAxisLimit = (mode == RDPCM_VER) ? uiWidth : uiHeight;
1786	const UInt minorAxisLimit = (mode == RDPCM_VER) ? uiHeight : uiWidth;
1787
1788	const Bool bUseHalfRoundingPoint = (mode != RDPCM_OFF);
1789
1790	uiAbsSum = 0;
1791
1792	for ( majorAxis = 0; majorAxis < majorAxisLimit; majorAxis++ )
1793	{
1794	TCoeff accumulatorValue = 0; // 32-bit accumulator
1795	for ( minorAxis = 0; minorAxis < minorAxisLimit; minorAxis++ )
1796	{
1797	const UInt sampleIndex = (uiY * uiWidth) + uiX;
1798	const UInt coefficientIndex = (rotateResidual ? (uiSizeMinus1-sampleIndex) : sampleIndex);
1799	const Pel currentSample = pcResidual[(uiY * uiStride) + uiX];
1800	const TCoeff encoderSideDelta = TCoeff(currentSample) - accumulatorValue;
1801
1802	Pel reconstructedDelta;
1803	if ( bLossless )
1804	{
1805	pcCoeff[coefficientIndex] = encoderSideDelta;
1806	reconstructedDelta = (Pel) encoderSideDelta;
1807	}
1808	else
1809	{
1810	transformSkipQuantOneSample(rTu, compID, encoderSideDelta, pcCoeff, coefficientIndex, cQP, bUseHalfRoundingPoint);
1811	invTrSkipDeQuantOneSample (rTu, compID, pcCoeff[coefficientIndex], reconstructedDelta, cQP, coefficientIndex);
1812	}
1813
1814	uiAbsSum += abs(pcCoeff[coefficientIndex]);
1815
1816	if (mode != RDPCM_OFF)
1817	{
1818	accumulatorValue += reconstructedDelta;
1819	}
1820	}
1821	}
1822	}
1823
1824	Void TComTrQuant::rdpcmNxN ( TComTU& rTu, const ComponentID compID, Pel* pcResidual, const UInt uiStride, const QpParam& cQP, TCoeff* pcCoeff, TCoeff &uiAbsSum, RDPCMMode& rdpcmMode )
1825	{
1826	TComDataCU *pcCU=rTu.getCU();
1827	const UInt uiAbsPartIdx=rTu.GetAbsPartIdxTU();
1828
1829	if (!pcCU->isRDPCMEnabled(uiAbsPartIdx) \|\| ((pcCU->getTransformSkip(uiAbsPartIdx, compID) == 0) && !pcCU->getCUTransquantBypass(uiAbsPartIdx)))
1830	{
1831	rdpcmMode = RDPCM_OFF;
1832	}
1833	else if ( pcCU->isIntra( uiAbsPartIdx ) )
1834	{
1835	const ChromaFormat chFmt = pcCU->getPic()->getPicYuvOrg()->getChromaFormat();
1836	const ChannelType chType = toChannelType(compID);
1837	const UInt uiChPredMode = pcCU->getIntraDir( chType, uiAbsPartIdx );
1838	const TComSPS *sps=pcCU->getSlice()->getSPS();
1839	const UInt partsPerMinCU = 1<<(2*(sps->getMaxTotalCUDepth() - sps->getLog2DiffMaxMinCodingBlockSize()));
1840	const UInt uiChCodedMode = (uiChPredMode==DM_CHROMA_IDX && isChroma(compID)) ? pcCU->getIntraDir(CHANNEL_TYPE_LUMA, getChromasCorrespondingPULumaIdx(uiAbsPartIdx, chFmt, partsPerMinCU)) : uiChPredMode;
1841	const UInt uiChFinalMode = ((chFmt == CHROMA_422) && isChroma(compID)) ? g_chroma422IntraAngleMappingTable[uiChCodedMode] : uiChCodedMode;
1842
1843	if (uiChFinalMode == VER_IDX \|\| uiChFinalMode == HOR_IDX)
1844	{
1845	rdpcmMode = (uiChFinalMode == VER_IDX) ? RDPCM_VER : RDPCM_HOR;
1846	applyForwardRDPCM( rTu, compID, pcResidual, uiStride, cQP, pcCoeff, uiAbsSum, rdpcmMode );
1847	}
1848	else
1849	{
1850	rdpcmMode = RDPCM_OFF;
1851	}
1852	}
1853	else // not intra, need to select the best mode
1854	{
1855	const UInt uiWidth = rTu.getRect(compID).width;
1856	const UInt uiHeight = rTu.getRect(compID).height;
1857
1858	RDPCMMode bestMode = NUMBER_OF_RDPCM_MODES;
1859	TCoeff bestAbsSum = std::numeric_limits<TCoeff>::max();
1860	TCoeff bestCoefficients[MAX_TU_SIZE * MAX_TU_SIZE];
1861
1862	for (UInt modeIndex = 0; modeIndex < NUMBER_OF_RDPCM_MODES; modeIndex++)
1863	{
1864	const RDPCMMode mode = RDPCMMode(modeIndex);
1865
1866	TCoeff currAbsSum = 0;
1867
1868	applyForwardRDPCM( rTu, compID, pcResidual, uiStride, cQP, pcCoeff, currAbsSum, mode );
1869
1870	if (currAbsSum < bestAbsSum)
1871	{
1872	bestMode = mode;
1873	bestAbsSum = currAbsSum;
1874	if (mode != RDPCM_OFF)
1875	{
1876	memcpy(bestCoefficients, pcCoeff, (uiWidth * uiHeight * sizeof(TCoeff)));
1877	}
1878	}
1879	}
1880
1881	rdpcmMode = bestMode;
1882	uiAbsSum = bestAbsSum;
1883
1884	if (rdpcmMode != RDPCM_OFF) //the TU is re-transformed and quantised if DPCM_OFF is returned, so there is no need to preserve it here
1885	{
1886	memcpy(pcCoeff, bestCoefficients, (uiWidth * uiHeight * sizeof(TCoeff)));
1887	}
1888	}
1889
1890	pcCU->setExplicitRdpcmModePartRange(rdpcmMode, compID, uiAbsPartIdx, rTu.GetAbsPartIdxNumParts(compID));
1891	}
1892
1893	Void TComTrQuant::invRdpcmNxN( TComTU& rTu, const ComponentID compID, Pel* pcResidual, const UInt uiStride )
1894	{
1895	TComDataCU *pcCU=rTu.getCU();
1896	const UInt uiAbsPartIdx=rTu.GetAbsPartIdxTU();
1897
1898	if (pcCU->isRDPCMEnabled( uiAbsPartIdx ) && ((pcCU->getTransformSkip(uiAbsPartIdx, compID ) != 0) \|\| pcCU->getCUTransquantBypass(uiAbsPartIdx)))
1899	{
1900	const UInt uiWidth = rTu.getRect(compID).width;
1901	const UInt uiHeight = rTu.getRect(compID).height;
1902
1903	RDPCMMode rdpcmMode = RDPCM_OFF;
1904
1905	if ( pcCU->isIntra( uiAbsPartIdx ) )
1906	{
1907	const ChromaFormat chFmt = pcCU->getPic()->getPicYuvRec()->getChromaFormat();
1908	const ChannelType chType = toChannelType(compID);
1909	const UInt uiChPredMode = pcCU->getIntraDir( chType, uiAbsPartIdx );
1910	const TComSPS *sps=pcCU->getSlice()->getSPS();
1911	const UInt partsPerMinCU = 1<<(2*(sps->getMaxTotalCUDepth() - sps->getLog2DiffMaxMinCodingBlockSize()));
1912	const UInt uiChCodedMode = (uiChPredMode==DM_CHROMA_IDX && isChroma(compID)) ? pcCU->getIntraDir(CHANNEL_TYPE_LUMA, getChromasCorrespondingPULumaIdx(uiAbsPartIdx, chFmt, partsPerMinCU)) : uiChPredMode;
1913	const UInt uiChFinalMode = ((chFmt == CHROMA_422) && isChroma(compID)) ? g_chroma422IntraAngleMappingTable[uiChCodedMode] : uiChCodedMode;
1914
1915	if (uiChFinalMode == VER_IDX \|\| uiChFinalMode == HOR_IDX)
1916	{
1917	rdpcmMode = (uiChFinalMode == VER_IDX) ? RDPCM_VER : RDPCM_HOR;
1918	}
1919	}
1920	else // not intra case
1921	{
1922	rdpcmMode = RDPCMMode(pcCU->getExplicitRdpcmMode( compID, uiAbsPartIdx ));
1923	}
1924
1925	const TCoeff pelMin=(TCoeff) std::numeric_limits<Pel>::min();
1926	const TCoeff pelMax=(TCoeff) std::numeric_limits<Pel>::max();
1927	if (rdpcmMode == RDPCM_VER)
1928	{
1929	for( UInt uiX = 0; uiX < uiWidth; uiX++ )
1930	{
1931	Pel *pcCurResidual = pcResidual+uiX;
1932	TCoeff accumulator = *pcCurResidual; // 32-bit accumulator
1933	pcCurResidual+=uiStride;
1934	for( UInt uiY = 1; uiY < uiHeight; uiY++, pcCurResidual+=uiStride )
1935	{
1936	accumulator += *(pcCurResidual);
1937	*pcCurResidual = (Pel)Clip3<TCoeff>(pelMin, pelMax, accumulator);
1938	}
1939	}
1940	}
1941	else if (rdpcmMode == RDPCM_HOR)
1942	{
1943	for( UInt uiY = 0; uiY < uiHeight; uiY++ )
1944	{
1945	Pel pcCurResidual = pcResidual+uiYuiStride;
1946	TCoeff accumulator = *pcCurResidual;
1947	pcCurResidual++;
1948	for( UInt uiX = 1; uiX < uiWidth; uiX++, pcCurResidual++ )
1949	{
1950	accumulator += *(pcCurResidual);
1951	*pcCurResidual = (Pel)Clip3<TCoeff>(pelMin, pelMax, accumulator);
1952	}
1953	}
1954	}
1955	}
1956	}
1957
1958	// ------------------------------------------------------------------------------------------------
1959	// Logical transform
1960	// ------------------------------------------------------------------------------------------------
1961
1962	/** Wrapper function between HM interface and core NxN forward transform (2D)
1963	* \param channelBitDepth bit depth of channel
1964	* \param useDST
1965	* \param piBlkResi input data (residual)
1966	* \param uiStride stride of input residual data
1967	* \param psCoeff output data (transform coefficients)
1968	* \param iWidth transform width
1969	* \param iHeight transform height
1970	* \param maxLog2TrDynamicRange
1971	*/
1972	Void TComTrQuant::xT( const Int channelBitDepth, Bool useDST, Pel* piBlkResi, UInt uiStride, TCoeff* psCoeff, Int iWidth, Int iHeight, const Int maxLog2TrDynamicRange )
1973	{
1974	#if MATRIX_MULT
1975	if( iWidth == iHeight)
1976	{
1977	xTr(channelBitDepth, piBlkResi, psCoeff, uiStride, (UInt)iWidth, useDST, maxLog2TrDynamicRange);
1978	return;
1979	}
1980	#endif
1981
1982	TCoeff block[ MAX_TU_SIZE * MAX_TU_SIZE ];
1983	TCoeff coeff[ MAX_TU_SIZE * MAX_TU_SIZE ];
1984
1985	for (Int y = 0; y < iHeight; y++)
1986	{
1987	for (Int x = 0; x < iWidth; x++)
1988	{
1989	block[(y * iWidth) + x] = piBlkResi[(y * uiStride) + x];
1990	}
1991	}
1992
1993	xTrMxN( channelBitDepth, block, coeff, iWidth, iHeight, useDST, maxLog2TrDynamicRange );
1994
1995	memcpy(psCoeff, coeff, (iWidth * iHeight * sizeof(TCoeff)));
1996	}
1997
1998	/** Wrapper function between HM interface and core NxN inverse transform (2D)
1999	* \param channelBitDepth bit depth of channel
2000	* \param useDST
2001	* \param plCoef input data (transform coefficients)
2002	* \param pResidual output data (residual)
2003	* \param uiStride stride of input residual data
2004	* \param iWidth transform width
2005	* \param iHeight transform height
2006	* \param maxLog2TrDynamicRange
2007	*/
2008	Void TComTrQuant::xIT( const Int channelBitDepth, Bool useDST, TCoeff* plCoef, Pel* pResidual, UInt uiStride, Int iWidth, Int iHeight, const Int maxLog2TrDynamicRange )
2009	{
2010	#if MATRIX_MULT
2011	if( iWidth == iHeight )
2012	{
2013	xITr(channelBitDepth, plCoef, pResidual, uiStride, (UInt)iWidth, useDST, maxLog2TrDynamicRange);
2014	return;
2015	}
2016	#endif
2017
2018	TCoeff block[ MAX_TU_SIZE * MAX_TU_SIZE ];
2019	TCoeff coeff[ MAX_TU_SIZE * MAX_TU_SIZE ];
2020
2021	memcpy(coeff, plCoef, (iWidth * iHeight * sizeof(TCoeff)));
2022
2023	xITrMxN( channelBitDepth, coeff, block, iWidth, iHeight, useDST, maxLog2TrDynamicRange );
2024
2025	for (Int y = 0; y < iHeight; y++)
2026	{
2027	for (Int x = 0; x < iWidth; x++)
2028	{
2029	pResidual[(y * uiStride) + x] = Pel(block[(y * iWidth) + x]);
2030	}
2031	}
2032	}
2033
2034	/** Wrapper function between HM interface and core 4x4 transform skipping
2035	* \param piBlkResi input data (residual)
2036	* \param uiStride stride of input residual data
2037	* \param psCoeff output data (transform coefficients)
2038	* \param rTu reference to transform data
2039	* \param component colour component
2040	*/
2041	Void TComTrQuant::xTransformSkip( Pel* piBlkResi, UInt uiStride, TCoeff* psCoeff, TComTU &rTu, const ComponentID component )
2042	{
2043	const TComRectangle &rect = rTu.getRect(component);
2044	const Int width = rect.width;
2045	const Int height = rect.height;
2046	const Int maxLog2TrDynamicRange = rTu.getCU()->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(component));
2047	#if SVC_EXTENSION
2048	const Int channelBitDepth = rTu.getCU()->getSlice()->getBitDepth(toChannelType(component));
2049	#else
2050	const Int channelBitDepth = rTu.getCU()->getSlice()->getSPS()->getBitDepth(toChannelType(component));
2051	#endif
2052
2053	Int iTransformShift = getTransformShift(channelBitDepth, rTu.GetEquivalentLog2TrSize(component), maxLog2TrDynamicRange);
2054	if (rTu.getCU()->getSlice()->getSPS()->getUseExtendedPrecision())
2055	{
2056	iTransformShift = std::max<Int>(0, iTransformShift);
2057	}
2058
2059	const Bool rotateResidual = rTu.isNonTransformedResidualRotated(component);
2060	const UInt uiSizeMinus1 = (width * height) - 1;
2061
2062	if (iTransformShift >= 0)
2063	{
2064	for (UInt y = 0, coefficientIndex = 0; y < height; y++)
2065	{
2066	for (UInt x = 0; x < width; x++, coefficientIndex++)
2067	{
2068	psCoeff[rotateResidual ? (uiSizeMinus1 - coefficientIndex) : coefficientIndex] = TCoeff(piBlkResi[(y * uiStride) + x]) << iTransformShift;
2069	}
2070	}
2071	}
2072	else //for very high bit depths
2073	{
2074	iTransformShift = -iTransformShift;
2075	const TCoeff offset = 1 << (iTransformShift - 1);
2076
2077	for (UInt y = 0, coefficientIndex = 0; y < height; y++)
2078	{
2079	for (UInt x = 0; x < width; x++, coefficientIndex++)
2080	{
2081	psCoeff[rotateResidual ? (uiSizeMinus1 - coefficientIndex) : coefficientIndex] = (TCoeff(piBlkResi[(y * uiStride) + x]) + offset) >> iTransformShift;
2082	}
2083	}
2084	}
2085	}
2086
2087	/** Wrapper function between HM interface and core NxN transform skipping
2088	* \param plCoef input data (coefficients)
2089	* \param pResidual output data (residual)
2090	* \param uiStride stride of input residual data
2091	* \param rTu reference to transform data
2092	* \param component colour component ID
2093	*/
2094	Void TComTrQuant::xITransformSkip( TCoeff* plCoef, Pel* pResidual, UInt uiStride, TComTU &rTu, const ComponentID component )
2095	{
2096	const TComRectangle &rect = rTu.getRect(component);
2097	const Int width = rect.width;
2098	const Int height = rect.height;
2099	const Int maxLog2TrDynamicRange = rTu.getCU()->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(component));
2100	#if O0043_BEST_EFFORT_DECODING
2101	const Int channelBitDepth = rTu.getCU()->getSlice()->getSPS()->getStreamBitDepth(toChannelType(component));
2102	#else
2103	#if SVC_EXTENSION
2104	const Int channelBitDepth = rTu.getCU()->getSlice()->getBitDepth(toChannelType(component));
2105	#else
2106	const Int channelBitDepth = rTu.getCU()->getSlice()->getSPS()->getBitDepth(toChannelType(component));
2107	#endif
2108	#endif
2109
2110	Int iTransformShift = getTransformShift(channelBitDepth, rTu.GetEquivalentLog2TrSize(component), maxLog2TrDynamicRange);
2111	if (rTu.getCU()->getSlice()->getSPS()->getUseExtendedPrecision())
2112	{
2113	iTransformShift = std::max<Int>(0, iTransformShift);
2114	}
2115
2116	const Bool rotateResidual = rTu.isNonTransformedResidualRotated(component);
2117	const UInt uiSizeMinus1 = (width * height) - 1;
2118
2119	if (iTransformShift >= 0)
2120	{
2121	const TCoeff offset = iTransformShift==0 ? 0 : (1 << (iTransformShift - 1));
2122
2123	for (UInt y = 0, coefficientIndex = 0; y < height; y++)
2124	{
2125	for (UInt x = 0; x < width; x++, coefficientIndex++)
2126	{
2127	pResidual[(y * uiStride) + x] = Pel((plCoef[rotateResidual ? (uiSizeMinus1 - coefficientIndex) : coefficientIndex] + offset) >> iTransformShift);
2128	}
2129	}
2130	}
2131	else //for very high bit depths
2132	{
2133	iTransformShift = -iTransformShift;
2134
2135	for (UInt y = 0, coefficientIndex = 0; y < height; y++)
2136	{
2137	for (UInt x = 0; x < width; x++, coefficientIndex++)
2138	{
2139	pResidual[(y * uiStride) + x] = Pel(plCoef[rotateResidual ? (uiSizeMinus1 - coefficientIndex) : coefficientIndex] << iTransformShift);
2140	}
2141	}
2142	}
2143	}
2144
2145	/** RDOQ with CABAC
2146	* \param rTu reference to transform data
2147	* \param plSrcCoeff pointer to input buffer
2148	* \param piDstCoeff reference to pointer to output buffer
2149	* \param piArlDstCoeff
2150	* \param uiAbsSum reference to absolute sum of quantized transform coefficient
2151	* \param compID colour component ID
2152	* \param cQP reference to quantization parameters
2153
2154	* Rate distortion optimized quantization for entropy
2155	* coding engines using probability models like CABAC
2156	*/
2157	Void TComTrQuant::xRateDistOptQuant ( TComTU &rTu,
2158	TCoeff * plSrcCoeff,
2159	TCoeff * piDstCoeff,
2160	#if ADAPTIVE_QP_SELECTION
2161	TCoeff * piArlDstCoeff,
2162	#endif
2163	TCoeff &uiAbsSum,
2164	const ComponentID compID,
2165	const QpParam &cQP )
2166	{
2167	const TComRectangle & rect = rTu.getRect(compID);
2168	const UInt uiWidth = rect.width;
2169	const UInt uiHeight = rect.height;
2170	TComDataCU * pcCU = rTu.getCU();
2171	const UInt uiAbsPartIdx = rTu.GetAbsPartIdxTU();
2172	const ChannelType channelType = toChannelType(compID);
2173	const UInt uiLog2TrSize = rTu.GetEquivalentLog2TrSize(compID);
2174
2175	const Bool extendedPrecision = pcCU->getSlice()->getSPS()->getUseExtendedPrecision();
2176	const Int maxLog2TrDynamicRange = pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID));
2177	#if SVC_EXTENSION
2178	const Int channelBitDepth = rTu.getCU()->getSlice()->getBitDepth(channelType);
2179	#else
2180	const Int channelBitDepth = rTu.getCU()->getSlice()->getSPS()->getBitDepth(channelType);
2181	#endif
2182
2183	/* for 422 chroma blocks, the effective scaling applied during transformation is not a power of 2, hence it cannot be
2184	* implemented as a bit-shift (the quantised result will be sqrt(2) * larger than required). Alternatively, adjust the
2185	* uiLog2TrSize applied in iTransformShift, such that the result is 1/sqrt(2) the required result (i.e. smaller)
2186	* Then a QP+3 (sqrt(2)) or QP-3 (1/sqrt(2)) method could be used to get the required result
2187	*/
2188
2189	// Represents scaling through forward transform
2190	Int iTransformShift = getTransformShift(channelBitDepth, uiLog2TrSize, maxLog2TrDynamicRange);
2191	if ((pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0) && pcCU->getSlice()->getSPS()->getUseExtendedPrecision())
2192	{
2193	iTransformShift = std::max<Int>(0, iTransformShift);
2194	}
2195
2196	const Bool bUseGolombRiceParameterAdaptation = pcCU->getSlice()->getSPS()->getUseGolombRiceParameterAdaptation();
2197	const UInt initialGolombRiceParameter = m_pcEstBitsSbac->golombRiceAdaptationStatistics[rTu.getGolombRiceStatisticsIndex(compID)] / RExt__GOLOMB_RICE_INCREMENT_DIVISOR;
2198	UInt uiGoRiceParam = initialGolombRiceParameter;
2199	Double d64BlockUncodedCost = 0;
2200	const UInt uiLog2BlockWidth = g_aucConvertToBit[ uiWidth ] + 2;
2201	const UInt uiLog2BlockHeight = g_aucConvertToBit[ uiHeight ] + 2;
2202	const UInt uiMaxNumCoeff = uiWidth * uiHeight;
2203	assert(compID<MAX_NUM_COMPONENT);
2204
2205	Int scalingListType = getScalingListType(pcCU->getPredictionMode(uiAbsPartIdx), compID);
2206	assert(scalingListType < SCALING_LIST_NUM);
2207
2208	#if ADAPTIVE_QP_SELECTION
2209	memset(piArlDstCoeff, 0, sizeof(TCoeff) * uiMaxNumCoeff);
2210	#endif
2211
2212	Double pdCostCoeff [ MAX_TU_SIZE * MAX_TU_SIZE ];
2213	Double pdCostSig [ MAX_TU_SIZE * MAX_TU_SIZE ];
2214	Double pdCostCoeff0[ MAX_TU_SIZE * MAX_TU_SIZE ];
2215	memset( pdCostCoeff, 0, sizeof(Double) * uiMaxNumCoeff );
2216	memset( pdCostSig, 0, sizeof(Double) * uiMaxNumCoeff );
2217	Int rateIncUp [ MAX_TU_SIZE * MAX_TU_SIZE ];
2218	Int rateIncDown [ MAX_TU_SIZE * MAX_TU_SIZE ];
2219	Int sigRateDelta[ MAX_TU_SIZE * MAX_TU_SIZE ];
2220	TCoeff deltaU [ MAX_TU_SIZE * MAX_TU_SIZE ];
2221	memset( rateIncUp, 0, sizeof(Int ) * uiMaxNumCoeff );
2222	memset( rateIncDown, 0, sizeof(Int ) * uiMaxNumCoeff );
2223	memset( sigRateDelta, 0, sizeof(Int ) * uiMaxNumCoeff );
2224	memset( deltaU, 0, sizeof(TCoeff) * uiMaxNumCoeff );
2225
2226	const Int iQBits = QUANT_SHIFT + cQP.per + iTransformShift; // Right shift of non-RDOQ quantizer; level = (coeff*uiQ + offset)>>q_bits
2227	const Double *const pdErrScale = getErrScaleCoeff(scalingListType, (uiLog2TrSize-2), cQP.rem);
2228	const Int *const piQCoef = getQuantCoeff(scalingListType, cQP.rem, (uiLog2TrSize-2));
2229
2230	const Bool enableScalingLists = getUseScalingList(uiWidth, uiHeight, (pcCU->getTransformSkip(uiAbsPartIdx, compID) != 0));
2231	const Int defaultQuantisationCoefficient = g_quantScales[cQP.rem];
2232	const Double defaultErrorScale = getErrScaleCoeffNoScalingList(scalingListType, (uiLog2TrSize-2), cQP.rem);
2233
2234	const TCoeff entropyCodingMinimum = -(1 << maxLog2TrDynamicRange);
2235	const TCoeff entropyCodingMaximum = (1 << maxLog2TrDynamicRange) - 1;
2236
2237	#if ADAPTIVE_QP_SELECTION
2238	Int iQBitsC = iQBits - ARL_C_PRECISION;
2239	Int iAddC = 1 << (iQBitsC-1);
2240	#endif
2241
2242	TUEntropyCodingParameters codingParameters;
2243	getTUEntropyCodingParameters(codingParameters, rTu, compID);
2244	const UInt uiCGSize = (1 << MLS_CG_SIZE);
2245
2246	Double pdCostCoeffGroupSig[ MLS_GRP_NUM ];
2247	UInt uiSigCoeffGroupFlag[ MLS_GRP_NUM ];
2248	Int iCGLastScanPos = -1;
2249
2250	UInt uiCtxSet = 0;
2251	Int c1 = 1;
2252	Int c2 = 0;
2253	Double d64BaseCost = 0;
2254	Int iLastScanPos = -1;
2255
2256	UInt c1Idx = 0;
2257	UInt c2Idx = 0;
2258	Int baseLevel;
2259
2260	memset( pdCostCoeffGroupSig, 0, sizeof(Double) * MLS_GRP_NUM );
2261	memset( uiSigCoeffGroupFlag, 0, sizeof(UInt) * MLS_GRP_NUM );
2262
2263	UInt uiCGNum = uiWidth * uiHeight >> MLS_CG_SIZE;
2264	Int iScanPos;
2265	coeffGroupRDStats rdStats;
2266
2267	const UInt significanceMapContextOffset = getSignificanceMapContextOffset(compID);
2268
2269	for (Int iCGScanPos = uiCGNum-1; iCGScanPos >= 0; iCGScanPos--)
2270	{
2271	UInt uiCGBlkPos = codingParameters.scanCG[ iCGScanPos ];
2272	UInt uiCGPosY = uiCGBlkPos / codingParameters.widthInGroups;
2273	UInt uiCGPosX = uiCGBlkPos - (uiCGPosY * codingParameters.widthInGroups);
2274
2275	memset( &rdStats, 0, sizeof (coeffGroupRDStats));
2276
2277	const Int patternSigCtx = TComTrQuant::calcPatternSigCtx(uiSigCoeffGroupFlag, uiCGPosX, uiCGPosY, codingParameters.widthInGroups, codingParameters.heightInGroups);
2278
2279	for (Int iScanPosinCG = uiCGSize-1; iScanPosinCG >= 0; iScanPosinCG--)
2280	{
2281	iScanPos = iCGScanPos*uiCGSize + iScanPosinCG;
2282	//===== quantization =====
2283	UInt uiBlkPos = codingParameters.scan[iScanPos];
2284	// set coeff
2285
2286	const Int quantisationCoefficient = (enableScalingLists) ? piQCoef [uiBlkPos] : defaultQuantisationCoefficient;
2287	const Double errorScale = (enableScalingLists) ? pdErrScale[uiBlkPos] : defaultErrorScale;
2288
2289	const Int64 tmpLevel = Int64(abs(plSrcCoeff[ uiBlkPos ])) * quantisationCoefficient;
2290
2291	const Intermediate_Int lLevelDouble = (Intermediate_Int)min<Int64>(tmpLevel, MAX_INTERMEDIATE_INT - (Intermediate_Int(1) << (iQBits - 1)));
2292
2293	#if ADAPTIVE_QP_SELECTION
2294	if( m_bUseAdaptQpSelect )
2295	{
2296	piArlDstCoeff[uiBlkPos] = (TCoeff)(( lLevelDouble + iAddC) >> iQBitsC );
2297	}
2298	#endif
2299	const UInt uiMaxAbsLevel = std::min<UInt>(UInt(entropyCodingMaximum), UInt((lLevelDouble + (Intermediate_Int(1) << (iQBits - 1))) >> iQBits));
2300
2301	const Double dErr = Double( lLevelDouble );
2302	pdCostCoeff0[ iScanPos ] = dErr * dErr * errorScale;
2303	d64BlockUncodedCost += pdCostCoeff0[ iScanPos ];
2304	piDstCoeff[ uiBlkPos ] = uiMaxAbsLevel;
2305
2306	if ( uiMaxAbsLevel > 0 && iLastScanPos < 0 )
2307	{
2308	iLastScanPos = iScanPos;
2309	uiCtxSet = getContextSetIndex(compID, (iScanPos >> MLS_CG_SIZE), 0);
2310	iCGLastScanPos = iCGScanPos;
2311	}
2312
2313	if ( iLastScanPos >= 0 )
2314	{
2315	//===== coefficient level estimation =====
2316	UInt uiLevel;
2317	UInt uiOneCtx = (NUM_ONE_FLAG_CTX_PER_SET * uiCtxSet) + c1;
2318	UInt uiAbsCtx = (NUM_ABS_FLAG_CTX_PER_SET * uiCtxSet) + c2;
2319
2320	if( iScanPos == iLastScanPos )
2321	{
2322	uiLevel = xGetCodedLevel( pdCostCoeff[ iScanPos ], pdCostCoeff0[ iScanPos ], pdCostSig[ iScanPos ],
2323	lLevelDouble, uiMaxAbsLevel, significanceMapContextOffset, uiOneCtx, uiAbsCtx, uiGoRiceParam,
2324	c1Idx, c2Idx, iQBits, errorScale, 1, extendedPrecision, maxLog2TrDynamicRange
2325	);
2326	}
2327	else
2328	{
2329	UShort uiCtxSig = significanceMapContextOffset + getSigCtxInc( patternSigCtx, codingParameters, iScanPos, uiLog2BlockWidth, uiLog2BlockHeight, channelType );
2330
2331	uiLevel = xGetCodedLevel( pdCostCoeff[ iScanPos ], pdCostCoeff0[ iScanPos ], pdCostSig[ iScanPos ],
2332	lLevelDouble, uiMaxAbsLevel, uiCtxSig, uiOneCtx, uiAbsCtx, uiGoRiceParam,
2333	c1Idx, c2Idx, iQBits, errorScale, 0, extendedPrecision, maxLog2TrDynamicRange
2334	);
2335
2336	sigRateDelta[ uiBlkPos ] = m_pcEstBitsSbac->significantBits[ uiCtxSig ][ 1 ] - m_pcEstBitsSbac->significantBits[ uiCtxSig ][ 0 ];
2337	}
2338
2339	deltaU[ uiBlkPos ] = TCoeff((lLevelDouble - (Intermediate_Int(uiLevel) << iQBits)) >> (iQBits-8));
2340
2341	if( uiLevel > 0 )
2342	{
2343	Int rateNow = xGetICRate( uiLevel, uiOneCtx, uiAbsCtx, uiGoRiceParam, c1Idx, c2Idx, extendedPrecision, maxLog2TrDynamicRange );
2344	rateIncUp [ uiBlkPos ] = xGetICRate( uiLevel+1, uiOneCtx, uiAbsCtx, uiGoRiceParam, c1Idx, c2Idx, extendedPrecision, maxLog2TrDynamicRange ) - rateNow;
2345	rateIncDown [ uiBlkPos ] = xGetICRate( uiLevel-1, uiOneCtx, uiAbsCtx, uiGoRiceParam, c1Idx, c2Idx, extendedPrecision, maxLog2TrDynamicRange ) - rateNow;
2346	}
2347	else // uiLevel == 0
2348	{
2349	rateIncUp [ uiBlkPos ] = m_pcEstBitsSbac->m_greaterOneBits[ uiOneCtx ][ 0 ];
2350	}
2351	piDstCoeff[ uiBlkPos ] = uiLevel;
2352	d64BaseCost += pdCostCoeff [ iScanPos ];
2353
2354	baseLevel = (c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx < C2FLAG_NUMBER)) : 1;
2355	if( uiLevel >= baseLevel )
2356	{
2357	if (uiLevel > 3*(1<<uiGoRiceParam))
2358	{
2359	uiGoRiceParam = bUseGolombRiceParameterAdaptation ? (uiGoRiceParam + 1) : (std::min<UInt>((uiGoRiceParam + 1), 4));
2360	}
2361	}
2362	if ( uiLevel >= 1)
2363	{
2364	c1Idx ++;
2365	}
2366
2367	//===== update bin model =====
2368	if( uiLevel > 1 )
2369	{
2370	c1 = 0;
2371	c2 += (c2 < 2);
2372	c2Idx ++;
2373	}
2374	else if( (c1 < 3) && (c1 > 0) && uiLevel)
2375	{
2376	c1++;
2377	}
2378
2379	//===== context set update =====
2380	if( ( iScanPos % uiCGSize == 0 ) && ( iScanPos > 0 ) )
2381	{
2382	uiCtxSet = getContextSetIndex(compID, ((iScanPos - 1) >> MLS_CG_SIZE), (c1 == 0)); //(iScanPos - 1) because we do this before entering the final group
2383	c1 = 1;
2384	c2 = 0;
2385	c1Idx = 0;
2386	c2Idx = 0;
2387	uiGoRiceParam = initialGolombRiceParameter;
2388	}
2389	}
2390	else
2391	{
2392	d64BaseCost += pdCostCoeff0[ iScanPos ];
2393	}
2394	rdStats.d64SigCost += pdCostSig[ iScanPos ];
2395	if (iScanPosinCG == 0 )
2396	{
2397	rdStats.d64SigCost_0 = pdCostSig[ iScanPos ];
2398	}
2399	if (piDstCoeff[ uiBlkPos ] )
2400	{
2401	uiSigCoeffGroupFlag[ uiCGBlkPos ] = 1;
2402	rdStats.d64CodedLevelandDist += pdCostCoeff[ iScanPos ] - pdCostSig[ iScanPos ];
2403	rdStats.d64UncodedDist += pdCostCoeff0[ iScanPos ];
2404	if ( iScanPosinCG != 0 )
2405	{
2406	rdStats.iNNZbeforePos0++;
2407	}
2408	}
2409	} //end for (iScanPosinCG)
2410
2411	if (iCGLastScanPos >= 0)
2412	{
2413	if( iCGScanPos )
2414	{
2415	if (uiSigCoeffGroupFlag[ uiCGBlkPos ] == 0)
2416	{
2417	UInt uiCtxSig = getSigCoeffGroupCtxInc( uiSigCoeffGroupFlag, uiCGPosX, uiCGPosY, codingParameters.widthInGroups, codingParameters.heightInGroups );
2418	d64BaseCost += xGetRateSigCoeffGroup(0, uiCtxSig) - rdStats.d64SigCost;;
2419	pdCostCoeffGroupSig[ iCGScanPos ] = xGetRateSigCoeffGroup(0, uiCtxSig);
2420	}
2421	else
2422	{
2423	if (iCGScanPos < iCGLastScanPos) //skip the last coefficient group, which will be handled together with last position below.
2424	{
2425	if ( rdStats.iNNZbeforePos0 == 0 )
2426	{
2427	d64BaseCost -= rdStats.d64SigCost_0;
2428	rdStats.d64SigCost -= rdStats.d64SigCost_0;
2429	}
2430	// rd-cost if SigCoeffGroupFlag = 0, initialization
2431	Double d64CostZeroCG = d64BaseCost;
2432
2433	// add SigCoeffGroupFlag cost to total cost
2434	UInt uiCtxSig = getSigCoeffGroupCtxInc( uiSigCoeffGroupFlag, uiCGPosX, uiCGPosY, codingParameters.widthInGroups, codingParameters.heightInGroups );
2435
2436	if (iCGScanPos < iCGLastScanPos)
2437	{
2438	d64BaseCost += xGetRateSigCoeffGroup(1, uiCtxSig);
2439	d64CostZeroCG += xGetRateSigCoeffGroup(0, uiCtxSig);
2440	pdCostCoeffGroupSig[ iCGScanPos ] = xGetRateSigCoeffGroup(1, uiCtxSig);
2441	}
2442
2443	// try to convert the current coeff group from non-zero to all-zero
2444	d64CostZeroCG += rdStats.d64UncodedDist; // distortion for resetting non-zero levels to zero levels
2445	d64CostZeroCG -= rdStats.d64CodedLevelandDist; // distortion and level cost for keeping all non-zero levels
2446	d64CostZeroCG -= rdStats.d64SigCost; // sig cost for all coeffs, including zero levels and non-zerl levels
2447
2448	// if we can save cost, change this block to all-zero block
2449	if ( d64CostZeroCG < d64BaseCost )
2450	{
2451	uiSigCoeffGroupFlag[ uiCGBlkPos ] = 0;
2452	d64BaseCost = d64CostZeroCG;
2453	if (iCGScanPos < iCGLastScanPos)
2454	{
2455	pdCostCoeffGroupSig[ iCGScanPos ] = xGetRateSigCoeffGroup(0, uiCtxSig);
2456	}
2457	// reset coeffs to 0 in this block
2458	for (Int iScanPosinCG = uiCGSize-1; iScanPosinCG >= 0; iScanPosinCG--)
2459	{
2460	iScanPos = iCGScanPos*uiCGSize + iScanPosinCG;
2461	UInt uiBlkPos = codingParameters.scan[ iScanPos ];
2462
2463	if (piDstCoeff[ uiBlkPos ])
2464	{
2465	piDstCoeff [ uiBlkPos ] = 0;
2466	pdCostCoeff[ iScanPos ] = pdCostCoeff0[ iScanPos ];
2467	pdCostSig [ iScanPos ] = 0;
2468	}
2469	}
2470	} // end if ( d64CostAllZeros < d64BaseCost )
2471	}
2472	} // end if if (uiSigCoeffGroupFlag[ uiCGBlkPos ] == 0)
2473	}
2474	else
2475	{
2476	uiSigCoeffGroupFlag[ uiCGBlkPos ] = 1;
2477	}
2478	}
2479	} //end for (iCGScanPos)
2480
2481	//===== estimate last position =====
2482	if ( iLastScanPos < 0 )
2483	{
2484	return;
2485	}
2486
2487	Double d64BestCost = 0;
2488	Int ui16CtxCbf = 0;
2489	Int iBestLastIdxP1 = 0;
2490	if( !pcCU->isIntra( uiAbsPartIdx ) && isLuma(compID) && pcCU->getTransformIdx( uiAbsPartIdx ) == 0 )
2491	{
2492	ui16CtxCbf = 0;
2493	d64BestCost = d64BlockUncodedCost + xGetICost( m_pcEstBitsSbac->blockRootCbpBits[ ui16CtxCbf ][ 0 ] );
2494	d64BaseCost += xGetICost( m_pcEstBitsSbac->blockRootCbpBits[ ui16CtxCbf ][ 1 ] );
2495	}
2496	else
2497	{
2498	ui16CtxCbf = pcCU->getCtxQtCbf( rTu, channelType );
2499	ui16CtxCbf += getCBFContextOffset(compID);
2500	d64BestCost = d64BlockUncodedCost + xGetICost( m_pcEstBitsSbac->blockCbpBits[ ui16CtxCbf ][ 0 ] );
2501	d64BaseCost += xGetICost( m_pcEstBitsSbac->blockCbpBits[ ui16CtxCbf ][ 1 ] );
2502	}
2503
2504
2505	Bool bFoundLast = false;
2506	for (Int iCGScanPos = iCGLastScanPos; iCGScanPos >= 0; iCGScanPos--)
2507	{
2508	UInt uiCGBlkPos = codingParameters.scanCG[ iCGScanPos ];
2509
2510	d64BaseCost -= pdCostCoeffGroupSig [ iCGScanPos ];
2511	if (uiSigCoeffGroupFlag[ uiCGBlkPos ])
2512	{
2513	for (Int iScanPosinCG = uiCGSize-1; iScanPosinCG >= 0; iScanPosinCG--)
2514	{
2515	iScanPos = iCGScanPos*uiCGSize + iScanPosinCG;
2516
2517	if (iScanPos > iLastScanPos)
2518	{
2519	continue;
2520	}
2521	UInt uiBlkPos = codingParameters.scan[iScanPos];
2522
2523	if( piDstCoeff[ uiBlkPos ] )
2524	{
2525	UInt uiPosY = uiBlkPos >> uiLog2BlockWidth;
2526	UInt uiPosX = uiBlkPos - ( uiPosY << uiLog2BlockWidth );
2527
2528	Double d64CostLast= codingParameters.scanType == SCAN_VER ? xGetRateLast( uiPosY, uiPosX, compID ) : xGetRateLast( uiPosX, uiPosY, compID );
2529	Double totalCost = d64BaseCost + d64CostLast - pdCostSig[ iScanPos ];
2530
2531	if( totalCost < d64BestCost )
2532	{
2533	iBestLastIdxP1 = iScanPos + 1;
2534	d64BestCost = totalCost;
2535	}
2536	if( piDstCoeff[ uiBlkPos ] > 1 )
2537	{
2538	bFoundLast = true;
2539	break;
2540	}
2541	d64BaseCost -= pdCostCoeff[ iScanPos ];
2542	d64BaseCost += pdCostCoeff0[ iScanPos ];
2543	}
2544	else
2545	{
2546	d64BaseCost -= pdCostSig[ iScanPos ];
2547	}
2548	} //end for
2549	if (bFoundLast)
2550	{
2551	break;
2552	}
2553	} // end if (uiSigCoeffGroupFlag[ uiCGBlkPos ])
2554	} // end for
2555
2556
2557	for ( Int scanPos = 0; scanPos < iBestLastIdxP1; scanPos++ )
2558	{
2559	Int blkPos = codingParameters.scan[ scanPos ];
2560	TCoeff level = piDstCoeff[ blkPos ];
2561	uiAbsSum += level;
2562	piDstCoeff[ blkPos ] = ( plSrcCoeff[ blkPos ] < 0 ) ? -level : level;
2563	}
2564
2565	//===== clean uncoded coefficients =====
2566	for ( Int scanPos = iBestLastIdxP1; scanPos <= iLastScanPos; scanPos++ )
2567	{
2568	piDstCoeff[ codingParameters.scan[ scanPos ] ] = 0;
2569	}
2570
2571
2572	if( pcCU->getSlice()->getPPS()->getSignHideFlag() && uiAbsSum>=2)
2573	{
2574	const Double inverseQuantScale = Double(g_invQuantScales[cQP.rem]);
2575	Int64 rdFactor = (Int64)(inverseQuantScale * inverseQuantScale * (1 << (2 * cQP.per))
2576	/ m_dLambda / 16 / (1 << (2 * DISTORTION_PRECISION_ADJUSTMENT(channelBitDepth - 8)))
2577	+ 0.5);
2578
2579	Int lastCG = -1;
2580	Int absSum = 0 ;
2581	Int n ;
2582
2583	for( Int subSet = (uiWidth*uiHeight-1) >> MLS_CG_SIZE; subSet >= 0; subSet-- )
2584	{
2585	Int subPos = subSet << MLS_CG_SIZE;
2586	Int firstNZPosInCG=uiCGSize , lastNZPosInCG=-1 ;
2587	absSum = 0 ;
2588
2589	for(n = uiCGSize-1; n >= 0; --n )
2590	{
2591	if( piDstCoeff[ codingParameters.scan[ n + subPos ]] )
2592	{
2593	lastNZPosInCG = n;
2594	break;
2595	}
2596	}
2597
2598	for(n = 0; n <uiCGSize; n++ )
2599	{
2600	if( piDstCoeff[ codingParameters.scan[ n + subPos ]] )
2601	{
2602	firstNZPosInCG = n;
2603	break;
2604	}
2605	}
2606
2607	for(n = firstNZPosInCG; n <=lastNZPosInCG; n++ )
2608	{
2609	absSum += Int(piDstCoeff[ codingParameters.scan[ n + subPos ]]);
2610	}
2611
2612	if(lastNZPosInCG>=0 && lastCG==-1)
2613	{
2614	lastCG = 1;
2615	}
2616
2617	if( lastNZPosInCG-firstNZPosInCG>=SBH_THRESHOLD )
2618	{
2619	UInt signbit = (piDstCoeff[codingParameters.scan[subPos+firstNZPosInCG]]>0?0:1);
2620	if( signbit!=(absSum&0x1) ) // hide but need tune
2621	{
2622	// calculate the cost
2623	Int64 minCostInc = MAX_INT64, curCost = MAX_INT64;
2624	Int minPos = -1, finalChange = 0, curChange = 0;
2625
2626	for( n = (lastCG==1?lastNZPosInCG:uiCGSize-1) ; n >= 0; --n )
2627	{
2628	UInt uiBlkPos = codingParameters.scan[ n + subPos ];
2629	if(piDstCoeff[ uiBlkPos ] != 0 )
2630	{
2631	Int64 costUp = rdFactor * ( - deltaU[uiBlkPos] ) + rateIncUp[uiBlkPos];
2632	Int64 costDown = rdFactor * ( deltaU[uiBlkPos] ) + rateIncDown[uiBlkPos]
2633	- ((abs(piDstCoeff[uiBlkPos]) == 1) ? sigRateDelta[uiBlkPos] : 0);
2634
2635	if(lastCG==1 && lastNZPosInCG==n && abs(piDstCoeff[uiBlkPos])==1)
2636	{
2637	costDown -= (4<<15);
2638	}
2639
2640	if(costUp<costDown)
2641	{
2642	curCost = costUp;
2643	curChange = 1;
2644	}
2645	else
2646	{
2647	curChange = -1;
2648	if(n==firstNZPosInCG && abs(piDstCoeff[uiBlkPos])==1)
2649	{
2650	curCost = MAX_INT64;
2651	}
2652	else
2653	{
2654	curCost = costDown;
2655	}
2656	}
2657	}
2658	else
2659	{
2660	curCost = rdFactor * ( - (abs(deltaU[uiBlkPos])) ) + (1<<15) + rateIncUp[uiBlkPos] + sigRateDelta[uiBlkPos] ;
2661	curChange = 1 ;
2662
2663	if(n<firstNZPosInCG)
2664	{
2665	UInt thissignbit = (plSrcCoeff[uiBlkPos]>=0?0:1);
2666	if(thissignbit != signbit )
2667	{
2668	curCost = MAX_INT64;
2669	}
2670	}
2671	}
2672
2673	if( curCost<minCostInc)
2674	{
2675	minCostInc = curCost;
2676	finalChange = curChange;
2677	minPos = uiBlkPos;
2678	}
2679	}
2680
2681	if(piDstCoeff[minPos] == entropyCodingMaximum \|\| piDstCoeff[minPos] == entropyCodingMinimum)
2682	{
2683	finalChange = -1;
2684	}
2685
2686	if(plSrcCoeff[minPos]>=0)
2687	{
2688	piDstCoeff[minPos] += finalChange ;
2689	}
2690	else
2691	{
2692	piDstCoeff[minPos] -= finalChange ;
2693	}
2694	}
2695	}
2696
2697	if(lastCG==1)
2698	{
2699	lastCG=0 ;
2700	}
2701	}
2702	}
2703	}
2704
2705
2706	/** Pattern decision for context derivation process of significant_coeff_flag
2707	* \param sigCoeffGroupFlag pointer to prior coded significant coeff group
2708	* \param uiCGPosX column of current coefficient group
2709	* \param uiCGPosY row of current coefficient group
2710	* \param widthInGroups width of the block
2711	* \param heightInGroups height of the block
2712	* \returns pattern for current coefficient group
2713	*/
2714	Int TComTrQuant::calcPatternSigCtx( const UInt* sigCoeffGroupFlag, UInt uiCGPosX, UInt uiCGPosY, UInt widthInGroups, UInt heightInGroups )
2715	{
2716	if ((widthInGroups <= 1) && (heightInGroups <= 1))
2717	{
2718	return 0;
2719	}
2720
2721	const Bool rightAvailable = uiCGPosX < (widthInGroups - 1);
2722	const Bool belowAvailable = uiCGPosY < (heightInGroups - 1);
2723
2724	UInt sigRight = 0;
2725	UInt sigLower = 0;
2726
2727	if (rightAvailable)
2728	{
2729	sigRight = ((sigCoeffGroupFlag[ (uiCGPosY * widthInGroups) + uiCGPosX + 1 ] != 0) ? 1 : 0);
2730	}
2731	if (belowAvailable)
2732	{
2733	sigLower = ((sigCoeffGroupFlag[ (uiCGPosY + 1) * widthInGroups + uiCGPosX ] != 0) ? 1 : 0);
2734	}
2735
2736	return sigRight + (sigLower << 1);
2737	}
2738
2739
2740	/** Context derivation process of coeff_abs_significant_flag
2741	* \param patternSigCtx pattern for current coefficient group
2742	* \param codingParameters coding parameters for the TU (includes the scan)
2743	* \param scanPosition current position in scan order
2744	* \param log2BlockWidth log2 width of the block
2745	* \param log2BlockHeight log2 height of the block
2746	* \param chanType channel type (CHANNEL_TYPE_LUMA/CHROMA)
2747	* \returns ctxInc for current scan position
2748	*/
2749	Int TComTrQuant::getSigCtxInc ( Int patternSigCtx,
2750	const TUEntropyCodingParameters &codingParameters,
2751	const Int scanPosition,
2752	const Int log2BlockWidth,
2753	const Int log2BlockHeight,
2754	const ChannelType chanType)
2755	{
2756	if (codingParameters.firstSignificanceMapContext == significanceMapContextSetStart[chanType][CONTEXT_TYPE_SINGLE])
2757	{
2758	//single context mode
2759	return significanceMapContextSetStart[chanType][CONTEXT_TYPE_SINGLE];
2760	}
2761
2762	const UInt rasterPosition = codingParameters.scan[scanPosition];
2763	const UInt posY = rasterPosition >> log2BlockWidth;
2764	const UInt posX = rasterPosition - (posY << log2BlockWidth);
2765
2766	if ((posX + posY) == 0)
2767	{
2768	return 0; //special case for the DC context variable
2769	}
2770
2771	Int offset = MAX_INT;
2772
2773	if ((log2BlockWidth == 2) && (log2BlockHeight == 2)) //4x4
2774	{
2775	offset = ctxIndMap4x4[ (4 * posY) + posX ];
2776	}
2777	else
2778	{
2779	Int cnt = 0;
2780
2781	switch (patternSigCtx)
2782	{
2783	//------------------
2784
2785	case 0: //neither neighbouring group is significant
2786	{
2787	const Int posXinSubset = posX & ((1 << MLS_CG_LOG2_WIDTH) - 1);
2788	const Int posYinSubset = posY & ((1 << MLS_CG_LOG2_HEIGHT) - 1);
2789	const Int posTotalInSubset = posXinSubset + posYinSubset;
2790
2791	//first N coefficients in scan order use 2; the next few use 1; the rest use 0.
2792	const UInt context1Threshold = NEIGHBOURHOOD_00_CONTEXT_1_THRESHOLD_4x4;
2793	const UInt context2Threshold = NEIGHBOURHOOD_00_CONTEXT_2_THRESHOLD_4x4;
2794
2795	cnt = (posTotalInSubset >= context1Threshold) ? 0 : ((posTotalInSubset >= context2Threshold) ? 1 : 2);
2796	}
2797	break;
2798
2799	//------------------
2800
2801	case 1: //right group is significant, below is not
2802	{
2803	const Int posYinSubset = posY & ((1 << MLS_CG_LOG2_HEIGHT) - 1);
2804	const Int groupHeight = 1 << MLS_CG_LOG2_HEIGHT;
2805
2806	cnt = (posYinSubset >= (groupHeight >> 1)) ? 0 : ((posYinSubset >= (groupHeight >> 2)) ? 1 : 2); //top quarter uses 2; second-from-top quarter uses 1; bottom half uses 0
2807	}
2808	break;
2809
2810	//------------------
2811
2812	case 2: //below group is significant, right is not
2813	{
2814	const Int posXinSubset = posX & ((1 << MLS_CG_LOG2_WIDTH) - 1);
2815	const Int groupWidth = 1 << MLS_CG_LOG2_WIDTH;
2816
2817	cnt = (posXinSubset >= (groupWidth >> 1)) ? 0 : ((posXinSubset >= (groupWidth >> 2)) ? 1 : 2); //left quarter uses 2; second-from-left quarter uses 1; right half uses 0
2818	}
2819	break;
2820
2821	//------------------
2822
2823	case 3: //both neighbouring groups are significant
2824	{
2825	cnt = 2;
2826	}
2827	break;
2828
2829	//------------------
2830
2831	default:
2832	std::cerr << "ERROR: Invalid patternSigCtx \"" << Int(patternSigCtx) << "\" in getSigCtxInc" << std::endl;
2833	exit(1);
2834	break;
2835	}
2836
2837	//------------------------------------------------
2838
2839	const Bool notFirstGroup = ((posX >> MLS_CG_LOG2_WIDTH) + (posY >> MLS_CG_LOG2_HEIGHT)) > 0;
2840
2841	offset = (notFirstGroup ? notFirstGroupNeighbourhoodContextOffset[chanType] : 0) + cnt;
2842	}
2843
2844	return codingParameters.firstSignificanceMapContext + offset;
2845	}
2846
2847
2848	/** Get the best level in RD sense
2849	*
2850	* \returns best quantized transform level for given scan position
2851	*
2852	* This method calculates the best quantized transform level for a given scan position.
2853	*/
2854	__inline UInt TComTrQuant::xGetCodedLevel ( Double& rd64CodedCost, //< reference to coded cost
2855	Double& rd64CodedCost0, //< reference to cost when coefficient is 0
2856	Double& rd64CodedCostSig, //< rd64CodedCostSig reference to cost of significant coefficient
2857	Intermediate_Int lLevelDouble, //< reference to unscaled quantized level
2858	UInt uiMaxAbsLevel, //< scaled quantized level
2859	UShort ui16CtxNumSig, //< current ctxInc for coeff_abs_significant_flag
2860	UShort ui16CtxNumOne, //< current ctxInc for coeff_abs_level_greater1 (1st bin of coeff_abs_level_minus1 in AVC)
2861	UShort ui16CtxNumAbs, //< current ctxInc for coeff_abs_level_greater2 (remaining bins of coeff_abs_level_minus1 in AVC)
2862	UShort ui16AbsGoRice, //< current Rice parameter for coeff_abs_level_minus3
2863	UInt c1Idx, //<
2864	UInt c2Idx, //<
2865	Int iQBits, //< quantization step size
2866	Double errorScale, //<
2867	Bool bLast, //< indicates if the coefficient is the last significant
2868	Bool useLimitedPrefixLength, //<
2869	const Int maxLog2TrDynamicRange //<
2870	) const
2871	{
2872	Double dCurrCostSig = 0;
2873	UInt uiBestAbsLevel = 0;
2874
2875	if( !bLast && uiMaxAbsLevel < 3 )
2876	{
2877	rd64CodedCostSig = xGetRateSigCoef( 0, ui16CtxNumSig );
2878	rd64CodedCost = rd64CodedCost0 + rd64CodedCostSig;
2879	if( uiMaxAbsLevel == 0 )
2880	{
2881	return uiBestAbsLevel;
2882	}
2883	}
2884	else
2885	{
2886	rd64CodedCost = MAX_DOUBLE;
2887	}
2888
2889	if( !bLast )
2890	{
2891	dCurrCostSig = xGetRateSigCoef( 1, ui16CtxNumSig );
2892	}
2893
2894	UInt uiMinAbsLevel = ( uiMaxAbsLevel > 1 ? uiMaxAbsLevel - 1 : 1 );
2895	for( Int uiAbsLevel = uiMaxAbsLevel; uiAbsLevel >= uiMinAbsLevel ; uiAbsLevel-- )
2896	{
2897	Double dErr = Double( lLevelDouble - ( Intermediate_Int(uiAbsLevel) << iQBits ) );
2898	Double dCurrCost = dErr * dErr * errorScale + xGetICost( xGetICRate( uiAbsLevel, ui16CtxNumOne, ui16CtxNumAbs, ui16AbsGoRice, c1Idx, c2Idx, useLimitedPrefixLength, maxLog2TrDynamicRange ) );
2899	dCurrCost += dCurrCostSig;
2900
2901	if( dCurrCost < rd64CodedCost )
2902	{
2903	uiBestAbsLevel = uiAbsLevel;
2904	rd64CodedCost = dCurrCost;
2905	rd64CodedCostSig = dCurrCostSig;
2906	}
2907	}
2908
2909	return uiBestAbsLevel;
2910	}
2911
2912	/** Calculates the cost for specific absolute transform level
2913	* \param uiAbsLevel scaled quantized level
2914	* \param ui16CtxNumOne current ctxInc for coeff_abs_level_greater1 (1st bin of coeff_abs_level_minus1 in AVC)
2915	* \param ui16CtxNumAbs current ctxInc for coeff_abs_level_greater2 (remaining bins of coeff_abs_level_minus1 in AVC)
2916	* \param ui16AbsGoRice Rice parameter for coeff_abs_level_minus3
2917	* \param c1Idx
2918	* \param c2Idx
2919	* \param useLimitedPrefixLength
2920	* \param maxLog2TrDynamicRange
2921	* \returns cost of given absolute transform level
2922	*/
2923	__inline Int TComTrQuant::xGetICRate ( const UInt uiAbsLevel,
2924	const UShort ui16CtxNumOne,
2925	const UShort ui16CtxNumAbs,
2926	const UShort ui16AbsGoRice,
2927	const UInt c1Idx,
2928	const UInt c2Idx,
2929	const Bool useLimitedPrefixLength,
2930	const Int maxLog2TrDynamicRange
2931	) const
2932	{
2933	Int iRate = Int(xGetIEPRate()); // cost of sign bit
2934	UInt baseLevel = (c1Idx < C1FLAG_NUMBER) ? (2 + (c2Idx < C2FLAG_NUMBER)) : 1;
2935
2936	if ( uiAbsLevel >= baseLevel )
2937	{
2938	UInt symbol = uiAbsLevel - baseLevel;
2939	UInt length;
2940	if (symbol < (COEF_REMAIN_BIN_REDUCTION << ui16AbsGoRice))
2941	{
2942	length = symbol>>ui16AbsGoRice;
2943	iRate += (length+1+ui16AbsGoRice)<< 15;
2944	}
2945	else if (useLimitedPrefixLength)
2946	{
2947	const UInt maximumPrefixLength = (32 - (COEF_REMAIN_BIN_REDUCTION + maxLog2TrDynamicRange));
2948
2949	UInt prefixLength = 0;
2950	UInt suffix = (symbol >> ui16AbsGoRice) - COEF_REMAIN_BIN_REDUCTION;
2951
2952	while ((prefixLength < maximumPrefixLength) && (suffix > ((2 << prefixLength) - 2)))
2953	{
2954	prefixLength++;
2955	}
2956
2957	const UInt suffixLength = (prefixLength == maximumPrefixLength) ? (maxLog2TrDynamicRange - ui16AbsGoRice) : (prefixLength + 1/separator/);
2958
2959	iRate += (COEF_REMAIN_BIN_REDUCTION + prefixLength + suffixLength + ui16AbsGoRice) << 15;
2960	}
2961	else
2962	{
2963	length = ui16AbsGoRice;
2964	symbol = symbol - ( COEF_REMAIN_BIN_REDUCTION << ui16AbsGoRice);
2965	while (symbol >= (1<<length))
2966	{
2967	symbol -= (1<<(length++));
2968	}
2969	iRate += (COEF_REMAIN_BIN_REDUCTION+length+1-ui16AbsGoRice+length)<< 15;
2970	}
2971
2972	if (c1Idx < C1FLAG_NUMBER)
2973	{
2974	iRate += m_pcEstBitsSbac->m_greaterOneBits[ ui16CtxNumOne ][ 1 ];
2975
2976	if (c2Idx < C2FLAG_NUMBER)
2977	{
2978	iRate += m_pcEstBitsSbac->m_levelAbsBits[ ui16CtxNumAbs ][ 1 ];
2979	}
2980	}
2981	}
2982	else if( uiAbsLevel == 1 )
2983	{
2984	iRate += m_pcEstBitsSbac->m_greaterOneBits[ ui16CtxNumOne ][ 0 ];
2985	}
2986	else if( uiAbsLevel == 2 )
2987	{
2988	iRate += m_pcEstBitsSbac->m_greaterOneBits[ ui16CtxNumOne ][ 1 ];
2989	iRate += m_pcEstBitsSbac->m_levelAbsBits[ ui16CtxNumAbs ][ 0 ];
2990	}
2991	else
2992	{
2993	iRate = 0;
2994	}
2995
2996	return iRate;
2997	}
2998
2999	__inline Double TComTrQuant::xGetRateSigCoeffGroup ( UShort uiSignificanceCoeffGroup,
3000	UShort ui16CtxNumSig ) const
3001	{
3002	return xGetICost( m_pcEstBitsSbac->significantCoeffGroupBits[ ui16CtxNumSig ][ uiSignificanceCoeffGroup ] );
3003	}
3004
3005	/** Calculates the cost of signaling the last significant coefficient in the block
3006	* \param uiPosX X coordinate of the last significant coefficient
3007	* \param uiPosY Y coordinate of the last significant coefficient
3008	* \param component colour component ID
3009	* \returns cost of last significant coefficient
3010	*/
3011	/*
3012	* \param uiWidth width of the transform unit (TU)
3013	*/
3014	__inline Double TComTrQuant::xGetRateLast ( const UInt uiPosX,
3015	const UInt uiPosY,
3016	const ComponentID component ) const
3017	{
3018	UInt uiCtxX = g_uiGroupIdx[uiPosX];
3019	UInt uiCtxY = g_uiGroupIdx[uiPosY];
3020
3021	Double uiCost = m_pcEstBitsSbac->lastXBits[toChannelType(component)][ uiCtxX ] + m_pcEstBitsSbac->lastYBits[toChannelType(component)][ uiCtxY ];
3022
3023	if( uiCtxX > 3 )
3024	{
3025	uiCost += xGetIEPRate() * ((uiCtxX-2)>>1);
3026	}
3027	if( uiCtxY > 3 )
3028	{
3029	uiCost += xGetIEPRate() * ((uiCtxY-2)>>1);
3030	}
3031	return xGetICost( uiCost );
3032	}
3033
3034	__inline Double TComTrQuant::xGetRateSigCoef ( UShort uiSignificance,
3035	UShort ui16CtxNumSig ) const
3036	{
3037	return xGetICost( m_pcEstBitsSbac->significantBits[ ui16CtxNumSig ][ uiSignificance ] );
3038	}
3039
3040	/** Get the cost for a specific rate
3041	* \param dRate rate of a bit
3042	* \returns cost at the specific rate
3043	*/
3044	__inline Double TComTrQuant::xGetICost ( Double dRate ) const
3045	{
3046	return m_dLambda * dRate;
3047	}
3048
3049	/** Get the cost of an equal probable bit
3050	* \returns cost of equal probable bit
3051	*/
3052	__inline Double TComTrQuant::xGetIEPRate ( ) const
3053	{
3054	return 32768;
3055	}
3056
3057	/** Context derivation process of coeff_abs_significant_flag
3058	* \param uiSigCoeffGroupFlag significance map of L1
3059	* \param uiCGPosX column of current scan position
3060	* \param uiCGPosY row of current scan position
3061	* \param widthInGroups width of the block
3062	* \param heightInGroups height of the block
3063	* \returns ctxInc for current scan position
3064	*/
3065	UInt TComTrQuant::getSigCoeffGroupCtxInc (const UInt* uiSigCoeffGroupFlag,
3066	const UInt uiCGPosX,
3067	const UInt uiCGPosY,
3068	const UInt widthInGroups,
3069	const UInt heightInGroups)
3070	{
3071	UInt sigRight = 0;
3072	UInt sigLower = 0;
3073
3074	if (uiCGPosX < (widthInGroups - 1))
3075	{
3076	sigRight = ((uiSigCoeffGroupFlag[ (uiCGPosY * widthInGroups) + uiCGPosX + 1 ] != 0) ? 1 : 0);
3077	}
3078	if (uiCGPosY < (heightInGroups - 1))
3079	{
3080	sigLower = ((uiSigCoeffGroupFlag[ (uiCGPosY + 1) * widthInGroups + uiCGPosX ] != 0) ? 1 : 0);
3081	}
3082
3083	return ((sigRight + sigLower) != 0) ? 1 : 0;
3084	}
3085
3086
3087	/** set quantized matrix coefficient for encode
3088	* \param scalingList quantized matrix address
3089	* \param format chroma format
3090	* \param maxLog2TrDynamicRange
3091	* \param bitDepths reference to bit depth array for all channels
3092	*/
3093	Void TComTrQuant::setScalingList(TComScalingList *scalingList, const Int maxLog2TrDynamicRange[MAX_NUM_CHANNEL_TYPE], const BitDepths &bitDepths)
3094	{
3095	const Int minimumQp = 0;
3096	const Int maximumQp = SCALING_LIST_REM_NUM;
3097
3098	for(UInt size = 0; size < SCALING_LIST_SIZE_NUM; size++)
3099	{
3100	for(UInt list = 0; list < SCALING_LIST_NUM; list++)
3101	{
3102	for(Int qp = minimumQp; qp < maximumQp; qp++)
3103	{
3104	xSetScalingListEnc(scalingList,list,size,qp);
3105	xSetScalingListDec(*scalingList,list,size,qp);
3106	setErrScaleCoeff(list,size,qp,maxLog2TrDynamicRange, bitDepths);
3107	}
3108	}
3109	}
3110	}
3111	/** set quantized matrix coefficient for decode
3112	* \param scalingList quantized matrix address
3113	* \param format chroma format
3114	*/
3115	Void TComTrQuant::setScalingListDec(const TComScalingList &scalingList)
3116	{
3117	const Int minimumQp = 0;
3118	const Int maximumQp = SCALING_LIST_REM_NUM;
3119
3120	for(UInt size = 0; size < SCALING_LIST_SIZE_NUM; size++)
3121	{
3122	for(UInt list = 0; list < SCALING_LIST_NUM; list++)
3123	{
3124	for(Int qp = minimumQp; qp < maximumQp; qp++)
3125	{
3126	xSetScalingListDec(scalingList,list,size,qp);
3127	}
3128	}
3129	}
3130	}
3131	/** set error scale coefficients
3132	* \param list list ID
3133	* \param size
3134	* \param qp quantization parameter
3135	* \param maxLog2TrDynamicRange
3136	* \param bitDepths reference to bit depth array for all channels
3137	*/
3138	Void TComTrQuant::setErrScaleCoeff(UInt list, UInt size, Int qp, const Int maxLog2TrDynamicRange[MAX_NUM_CHANNEL_TYPE], const BitDepths &bitDepths)
3139	{
3140	const UInt uiLog2TrSize = g_aucConvertToBit[ g_scalingListSizeX[size] ] + 2;
3141	const ChannelType channelType = ((list == 0) \|\| (list == MAX_NUM_COMPONENT)) ? CHANNEL_TYPE_LUMA : CHANNEL_TYPE_CHROMA;
3142
3143	const Int channelBitDepth = bitDepths.recon[channelType];
3144	const Int iTransformShift = getTransformShift(channelBitDepth, uiLog2TrSize, maxLog2TrDynamicRange[channelType]); // Represents scaling through forward transform
3145
3146	UInt i,uiMaxNumCoeff = g_scalingListSize[size];
3147	Int *piQuantcoeff;
3148	Double *pdErrScale;
3149	piQuantcoeff = getQuantCoeff(list, qp,size);
3150	pdErrScale = getErrScaleCoeff(list, size, qp);
3151
3152	Double dErrScale = (Double)(1<<SCALE_BITS); // Compensate for scaling of bitcount in Lagrange cost function
3153	dErrScale = dErrScalepow(2.0,(-2.0iTransformShift)); // Compensate for scaling through forward transform
3154
3155	for(i=0;i<uiMaxNumCoeff;i++)
3156	{
3157	pdErrScale[i] = dErrScale / piQuantcoeff[i] / piQuantcoeff[i] / (1 << DISTORTION_PRECISION_ADJUSTMENT(2 * (bitDepths.recon[channelType] - 8)));
3158	}
3159
3160	getErrScaleCoeffNoScalingList(list, size, qp) = dErrScale / g_quantScales[qp] / g_quantScales[qp] / (1 << DISTORTION_PRECISION_ADJUSTMENT(2 * (bitDepths.recon[channelType] - 8)));
3161	}
3162
3163	/** set quantized matrix coefficient for encode
3164	* \param scalingList quantized matrix address
3165	* \param listId List index
3166	* \param sizeId size index
3167	* \param qp Quantization parameter
3168	* \param format chroma format
3169	*/
3170	Void TComTrQuant::xSetScalingListEnc(TComScalingList *scalingList, UInt listId, UInt sizeId, Int qp)
3171	{
3172	UInt width = g_scalingListSizeX[sizeId];
3173	UInt height = g_scalingListSizeX[sizeId];
3174	UInt ratio = g_scalingListSizeX[sizeId]/min(MAX_MATRIX_SIZE_NUM,(Int)g_scalingListSizeX[sizeId]);
3175	Int *quantcoeff;
3176	Int *coeff = scalingList->getScalingListAddress(sizeId,listId);
3177	quantcoeff = getQuantCoeff(listId, qp, sizeId);
3178
3179	Int quantScales = g_quantScales[qp];
3180
3181	processScalingListEnc(coeff,
3182	quantcoeff,
3183	(quantScales << LOG2_SCALING_LIST_NEUTRAL_VALUE),
3184	height, width, ratio,
3185	min(MAX_MATRIX_SIZE_NUM, (Int)g_scalingListSizeX[sizeId]),
3186	scalingList->getScalingListDC(sizeId,listId));
3187	}
3188
3189	/** set quantized matrix coefficient for decode
3190	* \param scalingList quantaized matrix address
3191	* \param listId List index
3192	* \param sizeId size index
3193	* \param qp Quantization parameter
3194	* \param format chroma format
3195	*/
3196	Void TComTrQuant::xSetScalingListDec(const TComScalingList &scalingList, UInt listId, UInt sizeId, Int qp)
3197	{
3198	UInt width = g_scalingListSizeX[sizeId];
3199	UInt height = g_scalingListSizeX[sizeId];
3200	UInt ratio = g_scalingListSizeX[sizeId]/min(MAX_MATRIX_SIZE_NUM,(Int)g_scalingListSizeX[sizeId]);
3201	Int *dequantcoeff;
3202	const Int *coeff = scalingList.getScalingListAddress(sizeId,listId);
3203
3204	dequantcoeff = getDequantCoeff(listId, qp, sizeId);
3205
3206	Int invQuantScale = g_invQuantScales[qp];
3207
3208	processScalingListDec(coeff,
3209	dequantcoeff,
3210	invQuantScale,
3211	height, width, ratio,
3212	min(MAX_MATRIX_SIZE_NUM, (Int)g_scalingListSizeX[sizeId]),
3213	scalingList.getScalingListDC(sizeId,listId));
3214	}
3215
3216	/** set flat matrix value to quantized coefficient
3217	*/
3218	Void TComTrQuant::setFlatScalingList(const Int maxLog2TrDynamicRange[MAX_NUM_CHANNEL_TYPE], const BitDepths &bitDepths)
3219	{
3220	const Int minimumQp = 0;
3221	const Int maximumQp = SCALING_LIST_REM_NUM;
3222
3223	for(UInt size = 0; size < SCALING_LIST_SIZE_NUM; size++)
3224	{
3225	for(UInt list = 0; list < SCALING_LIST_NUM; list++)
3226	{
3227	for(Int qp = minimumQp; qp < maximumQp; qp++)
3228	{
3229	xsetFlatScalingList(list,size,qp);
3230	setErrScaleCoeff(list,size,qp,maxLog2TrDynamicRange, bitDepths);
3231	}
3232	}
3233	}
3234	}
3235
3236	/** set flat matrix value to quantized coefficient
3237	* \param list List ID
3238	* \param size size index
3239	* \param qp Quantization parameter
3240	* \param format chroma format
3241	*/
3242	Void TComTrQuant::xsetFlatScalingList(UInt list, UInt size, Int qp)
3243	{
3244	UInt i,num = g_scalingListSize[size];
3245	Int *quantcoeff;
3246	Int *dequantcoeff;
3247
3248	Int quantScales = g_quantScales [qp];
3249	Int invQuantScales = g_invQuantScales[qp] << 4;
3250
3251	quantcoeff = getQuantCoeff(list, qp, size);
3252	dequantcoeff = getDequantCoeff(list, qp, size);
3253
3254	for(i=0;i<num;i++)
3255	{
3256	*quantcoeff++ = quantScales;
3257	*dequantcoeff++ = invQuantScales;
3258	}
3259	}
3260
3261	/** set quantized matrix coefficient for encode
3262	* \param coeff quantaized matrix address
3263	* \param quantcoeff quantaized matrix address
3264	* \param quantScales Q(QP%6)
3265	* \param height height
3266	* \param width width
3267	* \param ratio ratio for upscale
3268	* \param sizuNum matrix size
3269	* \param dc dc parameter
3270	*/
3271	Void TComTrQuant::processScalingListEnc( Int coeff, Int quantcoeff, Int quantScales, UInt height, UInt width, UInt ratio, Int sizuNum, UInt dc)
3272	{
3273	for(UInt j=0;j<height;j++)
3274	{
3275	for(UInt i=0;i<width;i++)
3276	{
3277	quantcoeff[jwidth + i] = quantScales / coeff[sizuNum (j / ratio) + i / ratio];
3278	}
3279	}
3280
3281	if(ratio > 1)
3282	{
3283	quantcoeff[0] = quantScales / dc;
3284	}
3285	}
3286
3287	/** set quantized matrix coefficient for decode
3288	* \param coeff quantaized matrix address
3289	* \param dequantcoeff quantaized matrix address
3290	* \param invQuantScales IQ(QP%6))
3291	* \param height height
3292	* \param width width
3293	* \param ratio ratio for upscale
3294	* \param sizuNum matrix size
3295	* \param dc dc parameter
3296	*/
3297	Void TComTrQuant::processScalingListDec( const Int coeff, Int dequantcoeff, Int invQuantScales, UInt height, UInt width, UInt ratio, Int sizuNum, UInt dc)
3298	{
3299	for(UInt j=0;j<height;j++)
3300	{
3301	for(UInt i=0;i<width;i++)
3302	{
3303	dequantcoeff[jwidth + i] = invQuantScales coeff[sizuNum * (j / ratio) + i / ratio];
3304	}
3305	}
3306
3307	if(ratio > 1)
3308	{
3309	dequantcoeff[0] = invQuantScales * dc;
3310	}
3311	}
3312
3313	/** initialization process of scaling list array
3314	*/
3315	Void TComTrQuant::initScalingList()
3316	{
3317	for(UInt sizeId = 0; sizeId < SCALING_LIST_SIZE_NUM; sizeId++)
3318	{
3319	for(UInt qp = 0; qp < SCALING_LIST_REM_NUM; qp++)
3320	{
3321	for(UInt listId = 0; listId < SCALING_LIST_NUM; listId++)
3322	{
3323	m_quantCoef [sizeId][listId][qp] = new Int [g_scalingListSize[sizeId]];
3324	m_dequantCoef [sizeId][listId][qp] = new Int [g_scalingListSize[sizeId]];
3325	m_errScale [sizeId][listId][qp] = new Double [g_scalingListSize[sizeId]];
3326	} // listID loop
3327	}
3328	}
3329	}
3330
3331	/** destroy quantization matrix array
3332	*/
3333	Void TComTrQuant::destroyScalingList()
3334	{
3335	for(UInt sizeId = 0; sizeId < SCALING_LIST_SIZE_NUM; sizeId++)
3336	{
3337	for(UInt listId = 0; listId < SCALING_LIST_NUM; listId++)
3338	{
3339	for(UInt qp = 0; qp < SCALING_LIST_REM_NUM; qp++)
3340	{
3341	if(m_quantCoef[sizeId][listId][qp])
3342	{
3343	delete [] m_quantCoef[sizeId][listId][qp];
3344	}
3345	if(m_dequantCoef[sizeId][listId][qp])
3346	{
3347	delete [] m_dequantCoef[sizeId][listId][qp];
3348	}
3349	if(m_errScale[sizeId][listId][qp])
3350	{
3351	delete [] m_errScale[sizeId][listId][qp];
3352	}
3353	}
3354	}
3355	}
3356	}
3357
3358	Void TComTrQuant::transformSkipQuantOneSample(TComTU &rTu, const ComponentID compID, const TCoeff resiDiff, TCoeff* pcCoeff, const UInt uiPos, const QpParam &cQP, const Bool bUseHalfRoundingPoint)
3359	{
3360	TComDataCU *pcCU = rTu.getCU();
3361	const UInt uiAbsPartIdx = rTu.GetAbsPartIdxTU();
3362	const TComRectangle &rect = rTu.getRect(compID);
3363	const UInt uiWidth = rect.width;
3364	const UInt uiHeight = rect.height;
3365	const Int maxLog2TrDynamicRange = pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID));
3366	#if SVC_EXTENSION
3367	const Int channelBitDepth = pcCU->getSlice()->getBitDepth(toChannelType(compID));
3368	#else
3369	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
3370	#endif
3371	const Int iTransformShift = getTransformShift(channelBitDepth, rTu.GetEquivalentLog2TrSize(compID), maxLog2TrDynamicRange);
3372	const Int scalingListType = getScalingListType(pcCU->getPredictionMode(uiAbsPartIdx), compID);
3373	const Bool enableScalingLists = getUseScalingList(uiWidth, uiHeight, true);
3374	const Int defaultQuantisationCoefficient = g_quantScales[cQP.rem];
3375
3376	assert( scalingListType < SCALING_LIST_NUM );
3377	const Int *const piQuantCoeff = getQuantCoeff( scalingListType, cQP.rem, (rTu.GetEquivalentLog2TrSize(compID)-2) );
3378
3379
3380	/* for 422 chroma blocks, the effective scaling applied during transformation is not a power of 2, hence it cannot be
3381	* implemented as a bit-shift (the quantised result will be sqrt(2) * larger than required). Alternatively, adjust the
3382	* uiLog2TrSize applied in iTransformShift, such that the result is 1/sqrt(2) the required result (i.e. smaller)
3383	* Then a QP+3 (sqrt(2)) or QP-3 (1/sqrt(2)) method could be used to get the required result
3384	*/
3385
3386	const Int iQBits = QUANT_SHIFT + cQP.per + iTransformShift;
3387	// QBits will be OK for any internal bit depth as the reduction in transform shift is balanced by an increase in Qp_per due to QpBDOffset
3388
3389	const Int iAdd = ( bUseHalfRoundingPoint ? 256 : (pcCU->getSlice()->getSliceType() == I_SLICE ? 171 : 85) ) << (iQBits - 9);
3390
3391	TCoeff transformedCoefficient;
3392
3393	// transform-skip
3394	if (iTransformShift >= 0)
3395	{
3396	transformedCoefficient = resiDiff << iTransformShift;
3397	}
3398	else // for very high bit depths
3399	{
3400	const Int iTrShiftNeg = -iTransformShift;
3401	const Int offset = 1 << (iTrShiftNeg - 1);
3402	transformedCoefficient = ( resiDiff + offset ) >> iTrShiftNeg;
3403	}
3404
3405	// quantization
3406	const TCoeff iSign = (transformedCoefficient < 0 ? -1: 1);
3407
3408	const Int quantisationCoefficient = enableScalingLists ? piQuantCoeff[uiPos] : defaultQuantisationCoefficient;
3409
3410	const Int64 tmpLevel = (Int64)abs(transformedCoefficient) * quantisationCoefficient;
3411
3412	const TCoeff quantisedCoefficient = (TCoeff((tmpLevel + iAdd ) >> iQBits)) * iSign;
3413
3414	const TCoeff entropyCodingMinimum = -(1 << maxLog2TrDynamicRange);
3415	const TCoeff entropyCodingMaximum = (1 << maxLog2TrDynamicRange) - 1;
3416	pcCoeff[ uiPos ] = Clip3<TCoeff>( entropyCodingMinimum, entropyCodingMaximum, quantisedCoefficient );
3417	}
3418
3419
3420	Void TComTrQuant::invTrSkipDeQuantOneSample( TComTU &rTu, ComponentID compID, TCoeff inSample, Pel &reconSample, const QpParam &cQP, UInt uiPos )
3421	{
3422	TComDataCU *pcCU = rTu.getCU();
3423	const UInt uiAbsPartIdx = rTu.GetAbsPartIdxTU();
3424	const TComRectangle &rect = rTu.getRect(compID);
3425	const UInt uiWidth = rect.width;
3426	const UInt uiHeight = rect.height;
3427	const Int QP_per = cQP.per;
3428	const Int QP_rem = cQP.rem;
3429	const Int maxLog2TrDynamicRange = pcCU->getSlice()->getSPS()->getMaxLog2TrDynamicRange(toChannelType(compID));
3430	#if O0043_BEST_EFFORT_DECODING
3431	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getStreamBitDepth(toChannelType(compID));
3432	#else
3433	#if SVC_EXTENSION
3434	const Int channelBitDepth = pcCU->getSlice()->getBitDepth(toChannelType(compID));
3435	#else
3436	const Int channelBitDepth = pcCU->getSlice()->getSPS()->getBitDepth(toChannelType(compID));
3437	#endif
3438	#endif
3439	const Int iTransformShift = getTransformShift(channelBitDepth, rTu.GetEquivalentLog2TrSize(compID), maxLog2TrDynamicRange);
3440	const Int scalingListType = getScalingListType(pcCU->getPredictionMode(uiAbsPartIdx), compID);
3441	const Bool enableScalingLists = getUseScalingList(uiWidth, uiHeight, true);
3442	const UInt uiLog2TrSize = rTu.GetEquivalentLog2TrSize(compID);
3443
3444	assert( scalingListType < SCALING_LIST_NUM );
3445
3446	const Int rightShift = (IQUANT_SHIFT - (iTransformShift + QP_per)) + (enableScalingLists ? LOG2_SCALING_LIST_NEUTRAL_VALUE : 0);
3447
3448	const TCoeff transformMinimum = -(1 << maxLog2TrDynamicRange);
3449	const TCoeff transformMaximum = (1 << maxLog2TrDynamicRange) - 1;
3450
3451	// Dequantisation
3452
3453	TCoeff dequantisedSample;
3454
3455	if(enableScalingLists)
3456	{
3457	const UInt dequantCoefBits = 1 + IQUANT_SHIFT + SCALING_LIST_BITS;
3458	const UInt targetInputBitDepth = std::min<UInt>((maxLog2TrDynamicRange + 1), (((sizeof(Intermediate_Int) * 8) + rightShift) - dequantCoefBits));
3459
3460	const Intermediate_Int inputMinimum = -(1 << (targetInputBitDepth - 1));
3461	const Intermediate_Int inputMaximum = (1 << (targetInputBitDepth - 1)) - 1;
3462
3463	Int *piDequantCoef = getDequantCoeff(scalingListType,QP_rem,uiLog2TrSize-2);
3464
3465	if(rightShift > 0)
3466	{
3467	const Intermediate_Int iAdd = 1 << (rightShift - 1);
3468	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, inSample));
3469	const Intermediate_Int iCoeffQ = ((Intermediate_Int(clipQCoef) * piDequantCoef[uiPos]) + iAdd ) >> rightShift;
3470
3471	dequantisedSample = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
3472	}
3473	else
3474	{
3475	const Int leftShift = -rightShift;
3476	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, inSample));
3477	const Intermediate_Int iCoeffQ = (Intermediate_Int(clipQCoef) * piDequantCoef[uiPos]) << leftShift;
3478
3479	dequantisedSample = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
3480	}
3481	}
3482	else
3483	{
3484	const Int scale = g_invQuantScales[QP_rem];
3485	const Int scaleBits = (IQUANT_SHIFT + 1) ;
3486
3487	const UInt targetInputBitDepth = std::min<UInt>((maxLog2TrDynamicRange + 1), (((sizeof(Intermediate_Int) * 8) + rightShift) - scaleBits));
3488	const Intermediate_Int inputMinimum = -(1 << (targetInputBitDepth - 1));
3489	const Intermediate_Int inputMaximum = (1 << (targetInputBitDepth - 1)) - 1;
3490
3491	if (rightShift > 0)
3492	{
3493	const Intermediate_Int iAdd = 1 << (rightShift - 1);
3494	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, inSample));
3495	const Intermediate_Int iCoeffQ = (Intermediate_Int(clipQCoef) * scale + iAdd) >> rightShift;
3496
3497	dequantisedSample = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
3498	}
3499	else
3500	{
3501	const Int leftShift = -rightShift;
3502	const TCoeff clipQCoef = TCoeff(Clip3<Intermediate_Int>(inputMinimum, inputMaximum, inSample));
3503	const Intermediate_Int iCoeffQ = (Intermediate_Int(clipQCoef) * scale) << leftShift;
3504
3505	dequantisedSample = TCoeff(Clip3<Intermediate_Int>(transformMinimum,transformMaximum,iCoeffQ));
3506	}
3507	}
3508
3509	// Inverse transform-skip
3510
3511	if (iTransformShift >= 0)
3512	{
3513	const TCoeff offset = iTransformShift==0 ? 0 : (1 << (iTransformShift - 1));
3514	reconSample = Pel(( dequantisedSample + offset ) >> iTransformShift);
3515	}
3516	else //for very high bit depths
3517	{
3518	const Int iTrShiftNeg = -iTransformShift;
3519	reconSample = Pel(dequantisedSample << iTrShiftNeg);
3520	}
3521	}
3522
3523
3524	Void TComTrQuant::crossComponentPrediction( TComTU & rTu,
3525	const ComponentID compID,
3526	const Pel * piResiL,
3527	const Pel * piResiC,
3528	Pel * piResiT,
3529	const Int width,
3530	const Int height,
3531	const Int strideL,
3532	const Int strideC,
3533	const Int strideT,
3534	const Bool reverse )
3535	{
3536	const Pel *pResiL = piResiL;
3537	const Pel *pResiC = piResiC;
3538	Pel *pResiT = piResiT;
3539
3540	TComDataCU *pCU = rTu.getCU();
3541	const Int alpha = pCU->getCrossComponentPredictionAlpha( rTu.GetAbsPartIdxTU( compID ), compID );
3542	const Int diffBitDepth = pCU->getSlice()->getSPS()->getDifferentialLumaChromaBitDepth();
3543
3544	for( Int y = 0; y < height; y++ )
3545	{
3546	if (reverse)
3547	{
3548	// A constraint is to be added to the HEVC Standard to limit the size of pResiL and pResiC at this point.
3549	// The likely form of the constraint is to either restrict the values to CoeffMin to CoeffMax,
3550	// or to be representable in a bitDepthY+4 or bitDepthC+4 signed integer.
3551	// The result of the constraint is that for 8/10/12bit profiles, the input values
3552	// can be represented within a 16-bit Pel-type.
3553	#if RExt__HIGH_BIT_DEPTH_SUPPORT
3554	for( Int x = 0; x < width; x++ )
3555	{
3556	pResiT[x] = pResiC[x] + (( alpha * rightShift( pResiL[x], diffBitDepth) ) >> 3);
3557	}
3558	#else
3559	const Int minPel=std::numeric_limits<Pel>::min();
3560	const Int maxPel=std::numeric_limits<Pel>::max();
3561	for( Int x = 0; x < width; x++ )
3562	{
3563	pResiT[x] = Clip3<Int>(minPel, maxPel, pResiC[x] + (( alpha * rightShift<Int>(Int(pResiL[x]), diffBitDepth) ) >> 3));
3564	}
3565	#endif
3566	}
3567	else
3568	{
3569	// Forward does not need clipping. Pel type should always be big enough.
3570	for( Int x = 0; x < width; x++ )
3571	{
3572	pResiT[x] = pResiC[x] - (( alpha * rightShift<Int>(Int(pResiL[x]), diffBitDepth) ) >> 3);
3573	}
3574	}
3575
3576	pResiL += strideL;
3577	pResiC += strideC;
3578	pResiT += strideT;
3579	}
3580	}
3581
3582	//! \}

Note: See TracBrowser for help on using the repository browser.

Download in other formats: