#if VECTOR_CODING__DISTORTION_CALCULATIONS && (RExt__HIGH_BIT_DEPTH_SUPPORT==0)
#include <emmintrin.h>
#include <xmmintrin.h>
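// calcRdCost() return expressions for the different cost modes: the first two
// scale by 65536.0 (2^16), which suggests one operand is kept in 16.16 fixed
// point in those branches; the last two combine distortion and rate directly
// with a floating-point lambda.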
return ((distortion * 65536.0) / lambda) + numBits;
return distortion + ((numBits * lambda) / 65536.0);
return (distortion / lambda) + numBits;
return distortion + (numBits * lambda);
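// xGetExpGolombNumberOfBits(): map the signed value to an unsigned code number
// (non-positive -> 2*|v|+1, positive -> 2*v). The assert rejects INT_MIN, whose
// negation would overflow; the loop that follows (body elided here) presumably
// shifts uiTemp down while counting the Exp-Golomb code length.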
assert(iVal != std::numeric_limits<Int>::min());
UInt uiTemp = ( iVal <= 0) ? (UInt(-iVal)<<1)+1 : UInt(iVal<<1);
while ( 1 != uiTemp )
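// setDistParam(): block dimensions and pointers are copied into the DistParam;
// the width checks for 12, 24 and 48 apparently select the dedicated
// xGetSAD12/24/48 routines for widths that are not a multiple of 8.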
rcDistParam.iCols = uiBlkWidth;
rcDistParam.iRows = uiBlkHeight;
rcDistParam.pCur = piRefY;
if (rcDistParam.iCols == 12)
else if (rcDistParam.iCols == 24)
else if (rcDistParam.iCols == 48)
rcDistParam.pCur = piRefY;
rcDistParam.iStep = iStep;
if (rcDistParam.iCols == 12)
else if (rcDistParam.iCols == 24)
else if (rcDistParam.iCols == 48)
rcDP.iRows = iHeight;
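// calcHAD(): tile the block into 8x8 Hadamard transforms when both dimensions
// are multiples of 8, otherwise fall back to 4x4 tiles, accumulating the SATD
// of each tile.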
if ( ( (iWidth % 8) == 0 ) && ( (iHeight % 8) == 0 ) )
for ( y=0; y<iHeight; y+= 8 )
for ( x=0; x<iWidth; x+= 8 )
uiSum += xCalcHADs8x8( &pi0[x], &pi1[x], iStride0, iStride1, 1 );
assert ( ( (iWidth % 4) == 0 ) && ( (iHeight % 4) == 0 ) );
for ( y=0; y<iHeight; y+= 4 )
for ( x=0; x<iWidth; x+= 4 )
uiSum += xCalcHADs4x4( &pi0[x], &pi1[x], iStride0, iStride1, 1 );
setDistParam( uiBlkWidth, uiBlkHeight, eDFunc, cDtParam );
cDtParam.pOrg = piOrg;
cDtParam.pCur = piCur;
return cDtParam.DistFunc( &cDtParam );
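// SSE2 helpers for the low-bit-depth distortion path (Pel held in 16 bits).
// simdSADLine4n16b() computes the SAD of one line whose width is a multiple of
// 4: pixels are loaded four at a time (64 bits), |org - cur| is formed as
// max(org,cur) - min(org,cur), the running sum is kept in packed 16-bit lanes,
// and the result is widened to 32 bits and reduced horizontally with two
// shuffles.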
#if VECTOR_CODING__DISTORTION_CALCULATIONS && (RExt__HIGH_BIT_DEPTH_SUPPORT==0)
inline Int simdSADLine4n16b( const Pel * piOrg , const Pel * piCur , Int nWidth )
assert( !( nWidth & 0x03 ) );
__m128i org , cur , abs , sum;
sum = _mm_setzero_si128();
for( Int n = 0 ; n < nWidth ; n += 4 )
org = _mm_loadl_epi64( ( __m128i* )( piOrg + n ) );
cur = _mm_loadl_epi64( ( __m128i* )( piCur + n ) );
abs = _mm_subs_epi16( _mm_max_epi16( org , cur ) , _mm_min_epi16( org , cur ) );
sum = _mm_adds_epu16( abs , sum );
__m128i zero = _mm_setzero_si128();
sum = _mm_unpacklo_epi16( sum , zero );
sum = _mm_add_epi32( sum , _mm_shuffle_epi32( sum , _MM_SHUFFLE( 2 , 3 , 0 , 1 ) ) );
sum = _mm_add_epi32( sum , _mm_shuffle_epi32( sum , _MM_SHUFFLE( 1 , 0 , 3 , 2 ) ) );
return( _mm_cvtsi128_si32( sum ) );
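// simdSADLine8n16b(): same idea for widths that are a multiple of 8, using
// unaligned 128-bit loads (eight pixels per iteration); both 16-bit halves of
// the accumulator are widened and added before the horizontal reduction.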
inline Int simdSADLine8n16b( const Pel * piOrg , const Pel * piCur , Int nWidth )
assert( !( nWidth & 0x07 ) );
__m128i org , cur , abs , sum;
sum = _mm_setzero_si128();
for( Int n = 0 ; n < nWidth ; n += 8 )
org = _mm_loadu_si128( ( __m128i* )( piOrg + n ) );
cur = _mm_loadu_si128( ( __m128i* )( piCur + n ) );
abs = _mm_subs_epi16( _mm_max_epi16( org , cur ) , _mm_min_epi16( org , cur ) );
sum = _mm_adds_epu16( abs , sum );
__m128i zero = _mm_setzero_si128();
__m128i hi = _mm_unpackhi_epi16( sum , zero );
__m128i lo = _mm_unpacklo_epi16( sum , zero );
sum = _mm_add_epi32( lo , hi );
sum = _mm_add_epi32( sum , _mm_shuffle_epi32( sum , _MM_SHUFFLE( 2 , 3 , 0 , 1 ) ) );
sum = _mm_add_epi32( sum , _mm_shuffle_epi32( sum , _MM_SHUFFLE( 1 , 0 , 3 , 2 ) ) );
return( _mm_cvtsi128_si32( sum ) );
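// simd8x8Transpose32b(): transpose an 8x8 matrix of 32-bit values stored as
// sixteen __m128i (two registers per row). The data are reinterpreted as
// floats so that _MM_TRANSPOSE4_PS can transpose each 4x4 quadrant in place,
// and the two off-diagonal quadrants are swapped when writing back.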
inline Void simd8x8Transpose32b( __m128i * pBuffer )
for( Int n = 0 ; n < 16 ; n++ )
tmp[n] = _mm_castsi128_ps( pBuffer[n] );
_MM_TRANSPOSE4_PS( tmp[0] , tmp[2] , tmp[4] , tmp[6] );
_MM_TRANSPOSE4_PS( tmp[1] , tmp[3] , tmp[5] , tmp[7] );
_MM_TRANSPOSE4_PS( tmp[8] , tmp[10] , tmp[12] , tmp[14] );
_MM_TRANSPOSE4_PS( tmp[9] , tmp[11] , tmp[13] , tmp[15] );
for( Int n = 0 ; n < 8 ; n += 2 )
pBuffer[n]   = _mm_castps_si128( tmp[n] );
pBuffer[n+1] = _mm_castps_si128( tmp[n+8] );
pBuffer[n+8] = _mm_castps_si128( tmp[n+1] );
pBuffer[n+9] = _mm_castps_si128( tmp[n+9] );
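// The optimize("no-tree-vrp") attribute is applied only for GCC 4.6.x,
// presumably to work around a miscompilation of the code below by that
// compiler's value-range-propagation pass. simd8x8HAD1D32b() applies one
// 8-point Hadamard butterfly network (three stages, strides 4/2/1) down the
// columns of the 8x8 buffer of 32-bit values, writing the result back through
// pInput; the callers pass the same buffer for input and output.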
#define GCC_VERSION (__GNUC__ * 10000 + __GNUC_MINOR__ * 100 + __GNUC_PATCHLEVEL__)
#if GCC_VERSION > 40600 && GCC_VERSION < 40700
__attribute__((optimize("no-tree-vrp")))
Void simd8x8HAD1D32b( __m128i * pInput , __m128i * pOutput )
__m128i m1[8][2] , m2[8][2];
m2[0][0] = _mm_add_epi32( pInput[0] ,pInput[8 ] ); m2[0][1] = _mm_add_epi32( pInput[1] ,pInput[9 ] );
m2[1][0] = _mm_add_epi32( pInput[2] ,pInput[10] ); m2[1][1] = _mm_add_epi32( pInput[3] ,pInput[11] );
m2[2][0] = _mm_add_epi32( pInput[4] ,pInput[12] ); m2[2][1] = _mm_add_epi32( pInput[5] ,pInput[13] );
m2[3][0] = _mm_add_epi32( pInput[6] ,pInput[14] ); m2[3][1] = _mm_add_epi32( pInput[7] ,pInput[15] );
m2[4][0] = _mm_sub_epi32( pInput[0] ,pInput[8 ] ); m2[4][1] = _mm_sub_epi32( pInput[1] ,pInput[9 ] );
m2[5][0] = _mm_sub_epi32( pInput[2] ,pInput[10] ); m2[5][1] = _mm_sub_epi32( pInput[3] ,pInput[11] );
m2[6][0] = _mm_sub_epi32( pInput[4] ,pInput[12] ); m2[6][1] = _mm_sub_epi32( pInput[5] ,pInput[13] );
m2[7][0] = _mm_sub_epi32( pInput[6] ,pInput[14] ); m2[7][1] = _mm_sub_epi32( pInput[7] ,pInput[15] );
m1[0][0] = _mm_add_epi32( m2[0][0] , m2[2][0] ); m1[0][1] = _mm_add_epi32( m2[0][1] , m2[2][1] );
m1[1][0] = _mm_add_epi32( m2[1][0] , m2[3][0] ); m1[1][1] = _mm_add_epi32( m2[1][1] , m2[3][1] );
m1[2][0] = _mm_sub_epi32( m2[0][0] , m2[2][0] ); m1[2][1] = _mm_sub_epi32( m2[0][1] , m2[2][1] );
m1[3][0] = _mm_sub_epi32( m2[1][0] , m2[3][0] ); m1[3][1] = _mm_sub_epi32( m2[1][1] , m2[3][1] );
m1[4][0] = _mm_add_epi32( m2[4][0] , m2[6][0] ); m1[4][1] = _mm_add_epi32( m2[4][1] , m2[6][1] );
m1[5][0] = _mm_add_epi32( m2[5][0] , m2[7][0] ); m1[5][1] = _mm_add_epi32( m2[5][1] , m2[7][1] );
m1[6][0] = _mm_sub_epi32( m2[4][0] , m2[6][0] ); m1[6][1] = _mm_sub_epi32( m2[4][1] , m2[6][1] );
m1[7][0] = _mm_sub_epi32( m2[5][0] , m2[7][0] ); m1[7][1] = _mm_sub_epi32( m2[5][1] , m2[7][1] );
pInput[0 ] = _mm_add_epi32( m1[0][0] , m1[1][0] ); pInput[1 ] = _mm_add_epi32( m1[0][1] , m1[1][1] );
pInput[2 ] = _mm_sub_epi32( m1[0][0] , m1[1][0] ); pInput[3 ] = _mm_sub_epi32( m1[0][1] , m1[1][1] );
pInput[4 ] = _mm_add_epi32( m1[2][0] , m1[3][0] ); pInput[5 ] = _mm_add_epi32( m1[2][1] , m1[3][1] );
pInput[6 ] = _mm_sub_epi32( m1[2][0] , m1[3][0] ); pInput[7 ] = _mm_sub_epi32( m1[2][1] , m1[3][1] );
pInput[8 ] = _mm_add_epi32( m1[4][0] , m1[5][0] ); pInput[9 ] = _mm_add_epi32( m1[4][1] , m1[5][1] );
pInput[10] = _mm_sub_epi32( m1[4][0] , m1[5][0] ); pInput[11] = _mm_sub_epi32( m1[4][1] , m1[5][1] );
pInput[12] = _mm_add_epi32( m1[6][0] , m1[7][0] ); pInput[13] = _mm_add_epi32( m1[6][1] , m1[7][1] );
pInput[14] = _mm_sub_epi32( m1[6][0] , m1[7][0] ); pInput[15] = _mm_sub_epi32( m1[6][1] , m1[7][1] );
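// simdAbs32b(): branchless absolute value of packed 32-bit integers using a
// compare-and-select (SSE2 has no packed 32-bit abs instruction).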
inline __m128i simdAbs32b( __m128i m )
const __m128i zero = _mm_setzero_si128();
__m128i tmp = _mm_sub_epi32( zero , m );
__m128i mask = _mm_cmpgt_epi32( m , tmp );
return( _mm_or_si128( _mm_and_si128( mask , m ) , _mm_andnot_si128( mask , tmp ) ) );
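// simdHADs8x8(): 8x8 Hadamard (SATD) cost. The 16-bit residual of each row is
// sign-extended to 32 bits (unpacked against its own sign mask), then a
// transpose followed by a 1-D Hadamard pass is applied twice so that both
// dimensions are transformed; the absolute transformed coefficients are summed
// and the total is normalised by (sad + 2) >> 2.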
UInt simdHADs8x8( const Pel * piOrg, const Pel * piCur, Int iStrideOrg, Int iStrideCur )
__m128i mmDiff[8][2];
__m128i mmZero = _mm_setzero_si128();
for( Int n = 0 ; n < 8 ; n++ , piOrg += iStrideOrg , piCur += iStrideCur )
__m128i diff = _mm_sub_epi16( _mm_loadu_si128( ( __m128i* )piOrg ) , _mm_loadu_si128( ( __m128i* )piCur ) );
__m128i mask = _mm_cmplt_epi16( diff , mmZero );
mmDiff[n][0] = _mm_unpacklo_epi16( diff , mask );
mmDiff[n][1] = _mm_unpackhi_epi16( diff , mask );
simd8x8Transpose32b( &mmDiff[0][0] );
simd8x8HAD1D32b( &mmDiff[0][0] , &mmDiff[0][0] );
simd8x8Transpose32b( &mmDiff[0][0] );
simd8x8HAD1D32b( &mmDiff[0][0] , &mmDiff[0][0] );
__m128i mmSum = _mm_setzero_si128();
for( Int n = 0 ; n < 8 ; n++ )
mmSum = _mm_add_epi32( mmSum , simdAbs32b( mmDiff[n][0] ) );
mmSum = _mm_add_epi32( mmSum , simdAbs32b( mmDiff[n][1] ) );
mmSum = _mm_add_epi32( mmSum , _mm_shuffle_epi32( mmSum , _MM_SHUFFLE( 2 , 3 , 0 , 1 ) ) );
mmSum = _mm_add_epi32( mmSum , _mm_shuffle_epi32( mmSum , _MM_SHUFFLE( 1 , 0 , 3 , 2 ) ) );
UInt sad = _mm_cvtsi128_si32( mmSum );
sad = ( sad + 2 ) >> 2;
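// xGetSAD(): generic-width SAD. The vector path picks the 8-pixel or 4-pixel
// line kernel depending on whether the width is a multiple of 8; the scalar
// fallback visits every pixel. The sum is right-shifted by distortionShift
// (presumably the bit-depth dependent precision adjustment) on return.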
const Pel* piOrg = pcDtParam->pOrg;
const Pel* piCur = pcDtParam->pCur;
#if VECTOR_CODING__DISTORTION_CALCULATIONS && (RExt__HIGH_BIT_DEPTH_SUPPORT==0)
if( ( iCols & 0x07 ) == 0 )
for( Int iRows = pcDtParam->iRows ; iRows != 0; iRows-- )
uiSum += simdSADLine8n16b( piOrg , piCur , iCols );
for( Int iRows = pcDtParam->iRows; iRows != 0; iRows-- )
uiSum += simdSADLine4n16b( piOrg , piCur , iCols );
for( Int iRows = pcDtParam->iRows ; iRows != 0; iRows-- )
for ( Int n = 0; n < iCols; n++ )
uiSum += abs( piOrg[n] - piCur[n] );
return ( uiSum >> distortionShift );
#if VECTOR_CODING__DISTORTION_CALCULATIONS && (RExt__HIGH_BIT_DEPTH_SUPPORT==0)
return ( uiSum >> distortionShift );
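// Fixed-width SAD variants (xGetSAD4/8/16/12/16N/32/24/64/48). iSubShift
// selects row subsampling: only every (1 << iSubShift)-th row is visited and
// the final sum is scaled back up with uiSum <<= iSubShift. Each variant has a
// SIMD line kernel (where the width allows it) and an unrolled scalar fallback.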
const Pel* piOrg = pcDtParam->pOrg;
const Pel* piCur = pcDtParam->pCur;
Int iSubStep = ( 1 << iSubShift );
#if VECTOR_CODING__DISTORTION_CALCULATIONS && (RExt__HIGH_BIT_DEPTH_SUPPORT==0)
for( ; iRows != 0; iRows-=iSubStep )
uiSum += simdSADLine4n16b( piOrg , piCur , 4 );
for( ; iRows != 0; iRows-=iSubStep )
uiSum += abs( piOrg[0] - piCur[0] );
uiSum += abs( piOrg[1] - piCur[1] );
uiSum += abs( piOrg[2] - piCur[2] );
uiSum += abs( piOrg[3] - piCur[3] );
#if VECTOR_CODING__DISTORTION_CALCULATIONS && (RExt__HIGH_BIT_DEPTH_SUPPORT==0)
const Pel* piOrg = pcDtParam->pOrg;
const Pel* piCur = pcDtParam->pCur;
Int iSubStep = ( 1 << iSubShift );
#if VECTOR_CODING__DISTORTION_CALCULATIONS && (RExt__HIGH_BIT_DEPTH_SUPPORT==0)
for( ; iRows != 0; iRows-=iSubStep )
uiSum += simdSADLine8n16b( piOrg , piCur , 8 );
for( ; iRows != 0; iRows-=iSubStep )
uiSum += abs( piOrg[0] - piCur[0] );
uiSum += abs( piOrg[1] - piCur[1] );
uiSum += abs( piOrg[2] - piCur[2] );
uiSum += abs( piOrg[3] - piCur[3] );
uiSum += abs( piOrg[4] - piCur[4] );
uiSum += abs( piOrg[5] - piCur[5] );
uiSum += abs( piOrg[6] - piCur[6] );
uiSum += abs( piOrg[7] - piCur[7] );
#if VECTOR_CODING__DISTORTION_CALCULATIONS && (RExt__HIGH_BIT_DEPTH_SUPPORT==0)
const Pel* piOrg = pcDtParam->pOrg;
const Pel* piCur = pcDtParam->pCur;
Int iSubStep = ( 1 << iSubShift );
#if VECTOR_CODING__DISTORTION_CALCULATIONS && (RExt__HIGH_BIT_DEPTH_SUPPORT==0)
for( ; iRows != 0; iRows-=iSubStep )
uiSum += simdSADLine8n16b( piOrg , piCur , 16 );
for( ; iRows != 0; iRows-=iSubStep )
uiSum += abs( piOrg[0] - piCur[0] );
uiSum += abs( piOrg[1] - piCur[1] );
uiSum += abs( piOrg[2] - piCur[2] );
uiSum += abs( piOrg[3] - piCur[3] );
uiSum += abs( piOrg[4] - piCur[4] );
uiSum += abs( piOrg[5] - piCur[5] );
uiSum += abs( piOrg[6] - piCur[6] );
uiSum += abs( piOrg[7] - piCur[7] );
uiSum += abs( piOrg[8] - piCur[8] );
uiSum += abs( piOrg[9] - piCur[9] );
uiSum += abs( piOrg[10] - piCur[10] );
uiSum += abs( piOrg[11] - piCur[11] );
uiSum += abs( piOrg[12] - piCur[12] );
uiSum += abs( piOrg[13] - piCur[13] );
uiSum += abs( piOrg[14] - piCur[14] );
uiSum += abs( piOrg[15] - piCur[15] );
#if VECTOR_CODING__DISTORTION_CALCULATIONS && (RExt__HIGH_BIT_DEPTH_SUPPORT==0)
const Pel* piOrg = pcDtParam->pOrg;
const Pel* piCur = pcDtParam->pCur;
Int iSubStep = ( 1 << iSubShift );
for( ; iRows != 0; iRows-=iSubStep )
uiSum += abs( piOrg[0] - piCur[0] );
uiSum += abs( piOrg[1] - piCur[1] );
uiSum += abs( piOrg[2] - piCur[2] );
uiSum += abs( piOrg[3] - piCur[3] );
uiSum += abs( piOrg[4] - piCur[4] );
uiSum += abs( piOrg[5] - piCur[5] );
uiSum += abs( piOrg[6] - piCur[6] );
uiSum += abs( piOrg[7] - piCur[7] );
uiSum += abs( piOrg[8] - piCur[8] );
uiSum += abs( piOrg[9] - piCur[9] );
uiSum += abs( piOrg[10] - piCur[10] );
uiSum += abs( piOrg[11] - piCur[11] );
const Pel* piOrg = pcDtParam->pOrg;
const Pel* piCur = pcDtParam->pCur;
Int iSubStep = ( 1 << iSubShift );
#if VECTOR_CODING__DISTORTION_CALCULATIONS && (RExt__HIGH_BIT_DEPTH_SUPPORT==0)
for( ; iRows != 0; iRows-=iSubStep )
uiSum += simdSADLine8n16b( piOrg , piCur , iCols );
for( ; iRows != 0; iRows-=iSubStep )
for ( Int n = 0; n < iCols; n+=16 )
uiSum += abs( piOrg[n+ 0] - piCur[n+ 0] );
uiSum += abs( piOrg[n+ 1] - piCur[n+ 1] );
uiSum += abs( piOrg[n+ 2] - piCur[n+ 2] );
uiSum += abs( piOrg[n+ 3] - piCur[n+ 3] );
uiSum += abs( piOrg[n+ 4] - piCur[n+ 4] );
uiSum += abs( piOrg[n+ 5] - piCur[n+ 5] );
uiSum += abs( piOrg[n+ 6] - piCur[n+ 6] );
uiSum += abs( piOrg[n+ 7] - piCur[n+ 7] );
uiSum += abs( piOrg[n+ 8] - piCur[n+ 8] );
uiSum += abs( piOrg[n+ 9] - piCur[n+ 9] );
uiSum += abs( piOrg[n+10] - piCur[n+10] );
uiSum += abs( piOrg[n+11] - piCur[n+11] );
uiSum += abs( piOrg[n+12] - piCur[n+12] );
uiSum += abs( piOrg[n+13] - piCur[n+13] );
uiSum += abs( piOrg[n+14] - piCur[n+14] );
uiSum += abs( piOrg[n+15] - piCur[n+15] );
#if VECTOR_CODING__DISTORTION_CALCULATIONS && (RExt__HIGH_BIT_DEPTH_SUPPORT==0)
const Pel* piOrg = pcDtParam->pOrg;
const Pel* piCur = pcDtParam->pCur;
Int iSubStep = ( 1 << iSubShift );
#if VECTOR_CODING__DISTORTION_CALCULATIONS && (RExt__HIGH_BIT_DEPTH_SUPPORT==0)
for( ; iRows != 0; iRows-=iSubStep )
uiSum += simdSADLine8n16b( piOrg , piCur , 32 );
for( ; iRows != 0; iRows-=iSubStep )
uiSum += abs( piOrg[0] - piCur[0] );
uiSum += abs( piOrg[1] - piCur[1] );
uiSum += abs( piOrg[2] - piCur[2] );
uiSum += abs( piOrg[3] - piCur[3] );
uiSum += abs( piOrg[4] - piCur[4] );
uiSum += abs( piOrg[5] - piCur[5] );
uiSum += abs( piOrg[6] - piCur[6] );
uiSum += abs( piOrg[7] - piCur[7] );
uiSum += abs( piOrg[8] - piCur[8] );
uiSum += abs( piOrg[9] - piCur[9] );
uiSum += abs( piOrg[10] - piCur[10] );
uiSum += abs( piOrg[11] - piCur[11] );
uiSum += abs( piOrg[12] - piCur[12] );
uiSum += abs( piOrg[13] - piCur[13] );
uiSum += abs( piOrg[14] - piCur[14] );
uiSum += abs( piOrg[15] - piCur[15] );
uiSum += abs( piOrg[16] - piCur[16] );
uiSum += abs( piOrg[17] - piCur[17] );
uiSum += abs( piOrg[18] - piCur[18] );
uiSum += abs( piOrg[19] - piCur[19] );
uiSum += abs( piOrg[20] - piCur[20] );
uiSum += abs( piOrg[21] - piCur[21] );
uiSum += abs( piOrg[22] - piCur[22] );
uiSum += abs( piOrg[23] - piCur[23] );
uiSum += abs( piOrg[24] - piCur[24] );
uiSum += abs( piOrg[25] - piCur[25] );
uiSum += abs( piOrg[26] - piCur[26] );
uiSum += abs( piOrg[27] - piCur[27] );
uiSum += abs( piOrg[28] - piCur[28] );
uiSum += abs( piOrg[29] - piCur[29] );
uiSum += abs( piOrg[30] - piCur[30] );
uiSum += abs( piOrg[31] - piCur[31] );
#if VECTOR_CODING__DISTORTION_CALCULATIONS && (RExt__HIGH_BIT_DEPTH_SUPPORT==0)
const Pel* piOrg = pcDtParam->pOrg;
const Pel* piCur = pcDtParam->pCur;
Int iSubStep = ( 1 << iSubShift );
#if VECTOR_CODING__DISTORTION_CALCULATIONS && (RExt__HIGH_BIT_DEPTH_SUPPORT==0)
for( ; iRows != 0; iRows-=iSubStep )
uiSum += simdSADLine8n16b( piOrg , piCur , 24 );
for( ; iRows != 0; iRows-=iSubStep )
uiSum += abs( piOrg[0] - piCur[0] );
uiSum += abs( piOrg[1] - piCur[1] );
uiSum += abs( piOrg[2] - piCur[2] );
uiSum += abs( piOrg[3] - piCur[3] );
uiSum += abs( piOrg[4] - piCur[4] );
uiSum += abs( piOrg[5] - piCur[5] );
uiSum += abs( piOrg[6] - piCur[6] );
uiSum += abs( piOrg[7] - piCur[7] );
uiSum += abs( piOrg[8] - piCur[8] );
uiSum += abs( piOrg[9] - piCur[9] );
uiSum += abs( piOrg[10] - piCur[10] );
uiSum += abs( piOrg[11] - piCur[11] );
uiSum += abs( piOrg[12] - piCur[12] );
uiSum += abs( piOrg[13] - piCur[13] );
uiSum += abs( piOrg[14] - piCur[14] );
uiSum += abs( piOrg[15] - piCur[15] );
uiSum += abs( piOrg[16] - piCur[16] );
uiSum += abs( piOrg[17] - piCur[17] );
uiSum += abs( piOrg[18] - piCur[18] );
uiSum += abs( piOrg[19] - piCur[19] );
uiSum += abs( piOrg[20] - piCur[20] );
uiSum += abs( piOrg[21] - piCur[21] );
uiSum += abs( piOrg[22] - piCur[22] );
uiSum += abs( piOrg[23] - piCur[23] );
#if VECTOR_CODING__DISTORTION_CALCULATIONS && (RExt__HIGH_BIT_DEPTH_SUPPORT==0)
const Pel* piOrg = pcDtParam->pOrg;
const Pel* piCur = pcDtParam->pCur;
Int iSubStep = ( 1 << iSubShift );
#if VECTOR_CODING__DISTORTION_CALCULATIONS && (RExt__HIGH_BIT_DEPTH_SUPPORT==0)
for( ; iRows != 0; iRows-=iSubStep )
uiSum += simdSADLine8n16b( piOrg , piCur , 64 );
for( ; iRows != 0; iRows-=iSubStep )
uiSum += abs( piOrg[0] - piCur[0] );
uiSum += abs( piOrg[1] - piCur[1] );
uiSum += abs( piOrg[2] - piCur[2] );
uiSum += abs( piOrg[3] - piCur[3] );
uiSum += abs( piOrg[4] - piCur[4] );
uiSum += abs( piOrg[5] - piCur[5] );
uiSum += abs( piOrg[6] - piCur[6] );
uiSum += abs( piOrg[7] - piCur[7] );
uiSum += abs( piOrg[8] - piCur[8] );
uiSum += abs( piOrg[9] - piCur[9] );
uiSum += abs( piOrg[10] - piCur[10] );
uiSum += abs( piOrg[11] - piCur[11] );
uiSum += abs( piOrg[12] - piCur[12] );
uiSum += abs( piOrg[13] - piCur[13] );
uiSum += abs( piOrg[14] - piCur[14] );
uiSum += abs( piOrg[15] - piCur[15] );
uiSum += abs( piOrg[16] - piCur[16] );
uiSum += abs( piOrg[17] - piCur[17] );
uiSum += abs( piOrg[18] - piCur[18] );
uiSum += abs( piOrg[19] - piCur[19] );
uiSum += abs( piOrg[20] - piCur[20] );
uiSum += abs( piOrg[21] - piCur[21] );
uiSum += abs( piOrg[22] - piCur[22] );
uiSum += abs( piOrg[23] - piCur[23] );
uiSum += abs( piOrg[24] - piCur[24] );
uiSum += abs( piOrg[25] - piCur[25] );
uiSum += abs( piOrg[26] - piCur[26] );
uiSum += abs( piOrg[27] - piCur[27] );
uiSum += abs( piOrg[28] - piCur[28] );
uiSum += abs( piOrg[29] - piCur[29] );
uiSum += abs( piOrg[30] - piCur[30] );
uiSum += abs( piOrg[31] - piCur[31] );
uiSum += abs( piOrg[32] - piCur[32] );
uiSum += abs( piOrg[33] - piCur[33] );
uiSum += abs( piOrg[34] - piCur[34] );
uiSum += abs( piOrg[35] - piCur[35] );
uiSum += abs( piOrg[36] - piCur[36] );
uiSum += abs( piOrg[37] - piCur[37] );
uiSum += abs( piOrg[38] - piCur[38] );
uiSum += abs( piOrg[39] - piCur[39] );
uiSum += abs( piOrg[40] - piCur[40] );
uiSum += abs( piOrg[41] - piCur[41] );
uiSum += abs( piOrg[42] - piCur[42] );
uiSum += abs( piOrg[43] - piCur[43] );
uiSum += abs( piOrg[44] - piCur[44] );
uiSum += abs( piOrg[45] - piCur[45] );
uiSum += abs( piOrg[46] - piCur[46] );
uiSum += abs( piOrg[47] - piCur[47] );
uiSum += abs( piOrg[48] - piCur[48] );
uiSum += abs( piOrg[49] - piCur[49] );
uiSum += abs( piOrg[50] - piCur[50] );
uiSum += abs( piOrg[51] - piCur[51] );
uiSum += abs( piOrg[52] - piCur[52] );
uiSum += abs( piOrg[53] - piCur[53] );
uiSum += abs( piOrg[54] - piCur[54] );
uiSum += abs( piOrg[55] - piCur[55] );
uiSum += abs( piOrg[56] - piCur[56] );
uiSum += abs( piOrg[57] - piCur[57] );
uiSum += abs( piOrg[58] - piCur[58] );
uiSum += abs( piOrg[59] - piCur[59] );
uiSum += abs( piOrg[60] - piCur[60] );
uiSum += abs( piOrg[61] - piCur[61] );
uiSum += abs( piOrg[62] - piCur[62] );
uiSum += abs( piOrg[63] - piCur[63] );
piOrg += iStrideOrg;
piCur += iStrideCur;
#if VECTOR_CODING__DISTORTION_CALCULATIONS && (RExt__HIGH_BIT_DEPTH_SUPPORT==0)
uiSum <<= iSubShift;
const Pel* piOrg = pcDtParam->pOrg;
const Pel* piCur = pcDtParam->pCur;
Int iSubStep = ( 1 << iSubShift );
#if VECTOR_CODING__DISTORTION_CALCULATIONS && (RExt__HIGH_BIT_DEPTH_SUPPORT==0)
for( ; iRows != 0; iRows-=iSubStep )
uiSum += simdSADLine8n16b( piOrg , piCur , 48 );
piOrg += iStrideOrg;
piCur += iStrideCur;
for( ; iRows != 0; iRows-=iSubStep )
uiSum += abs( piOrg[0] - piCur[0] );
uiSum += abs( piOrg[1] - piCur[1] );
uiSum += abs( piOrg[2] - piCur[2] );
uiSum += abs( piOrg[3] - piCur[3] );
uiSum += abs( piOrg[4] - piCur[4] );
uiSum += abs( piOrg[5] - piCur[5] );
uiSum += abs( piOrg[6] - piCur[6] );
uiSum += abs( piOrg[7] - piCur[7] );
uiSum += abs( piOrg[8] - piCur[8] );
uiSum += abs( piOrg[9] - piCur[9] );
uiSum += abs( piOrg[10] - piCur[10] );
uiSum += abs( piOrg[11] - piCur[11] );
uiSum += abs( piOrg[12] - piCur[12] );
uiSum += abs( piOrg[13] - piCur[13] );
uiSum += abs( piOrg[14] - piCur[14] );
uiSum += abs( piOrg[15] - piCur[15] );
uiSum += abs( piOrg[16] - piCur[16] );
uiSum += abs( piOrg[17] - piCur[17] );
uiSum += abs( piOrg[18] - piCur[18] );
uiSum += abs( piOrg[19] - piCur[19] );
uiSum += abs( piOrg[20] - piCur[20] );
uiSum += abs( piOrg[21] - piCur[21] );
uiSum += abs( piOrg[22] - piCur[22] );
uiSum += abs( piOrg[23] - piCur[23] );
uiSum += abs( piOrg[24] - piCur[24] );
uiSum += abs( piOrg[25] - piCur[25] );
uiSum += abs( piOrg[26] - piCur[26] );
uiSum += abs( piOrg[27] - piCur[27] );
uiSum += abs( piOrg[28] - piCur[28] );
uiSum += abs( piOrg[29] - piCur[29] );
uiSum += abs( piOrg[30] - piCur[30] );
uiSum += abs( piOrg[31] - piCur[31] );
uiSum += abs( piOrg[32] - piCur[32] );
uiSum += abs( piOrg[33] - piCur[33] );
uiSum += abs( piOrg[34] - piCur[34] );
uiSum += abs( piOrg[35] - piCur[35] );
uiSum += abs( piOrg[36] - piCur[36] );
uiSum += abs( piOrg[37] - piCur[37] );
uiSum += abs( piOrg[38] - piCur[38] );
uiSum += abs( piOrg[39] - piCur[39] );
uiSum += abs( piOrg[40] - piCur[40] );
uiSum += abs( piOrg[41] - piCur[41] );
uiSum += abs( piOrg[42] - piCur[42] );
uiSum += abs( piOrg[43] - piCur[43] );
uiSum += abs( piOrg[44] - piCur[44] );
uiSum += abs( piOrg[45] - piCur[45] );
uiSum += abs( piOrg[46] - piCur[46] );
uiSum += abs( piOrg[47] - piCur[47] );
piOrg += iStrideOrg;
piCur += iStrideCur;
#if VECTOR_CODING__DISTORTION_CALCULATIONS && (RExt__HIGH_BIT_DEPTH_SUPPORT==0)
uiSum <<= iSubShift;
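// Sum-of-squared-error (SSE) distortion functions. iTemp is the per-pixel
// difference; its square is right-shifted by uiShift (a bit-depth dependent
// precision adjustment set up in code not shown in this excerpt) before being
// accumulated into the Distortion sum. The fixed-width variants assert the
// expected block width and fully unroll one row per loop iteration.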
const Pel* piOrg = pcDtParam->pOrg;
const Pel* piCur = pcDtParam->pCur;
for( ; iRows != 0; iRows-- )
for ( Int n = 0; n < iCols; n++ )
iTemp = piOrg[n ] - piCur[n ];
uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
piOrg += iStrideOrg;
piCur += iStrideCur;
assert( pcDtParam->iCols == 4 );
const Pel* piOrg = pcDtParam->pOrg;
const Pel* piCur = pcDtParam->pCur;
for( ; iRows != 0; iRows-- )
iTemp = piOrg[0] - piCur[0]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[1] - piCur[1]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[2] - piCur[2]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[3] - piCur[3]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
piOrg += iStrideOrg;
piCur += iStrideCur;
assert( pcDtParam->iCols == 8 );
const Pel* piOrg = pcDtParam->pOrg;
const Pel* piCur = pcDtParam->pCur;
for( ; iRows != 0; iRows-- )
iTemp = piOrg[0] - piCur[0]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[1] - piCur[1]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[2] - piCur[2]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[3] - piCur[3]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[4] - piCur[4]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[5] - piCur[5]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[6] - piCur[6]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[7] - piCur[7]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
piOrg += iStrideOrg;
piCur += iStrideCur;
assert( pcDtParam->iCols == 16 );
const Pel* piOrg = pcDtParam->pOrg;
const Pel* piCur = pcDtParam->pCur;
for( ; iRows != 0; iRows-- )
iTemp = piOrg[ 0] - piCur[ 0]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[ 1] - piCur[ 1]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[ 2] - piCur[ 2]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[ 3] - piCur[ 3]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[ 4] - piCur[ 4]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[ 5] - piCur[ 5]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[ 6] - piCur[ 6]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[ 7] - piCur[ 7]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[ 8] - piCur[ 8]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[ 9] - piCur[ 9]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[10] - piCur[10]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[11] - piCur[11]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[12] - piCur[12]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[13] - piCur[13]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[14] - piCur[14]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[15] - piCur[15]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
piOrg += iStrideOrg;
piCur += iStrideCur;
const Pel* piOrg = pcDtParam->pOrg;
const Pel* piCur = pcDtParam->pCur;
for( ; iRows != 0; iRows-- )
for ( Int n = 0; n < iCols; n+=16 )
iTemp = piOrg[n+ 0] - piCur[n+ 0]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[n+ 1] - piCur[n+ 1]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[n+ 2] - piCur[n+ 2]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[n+ 3] - piCur[n+ 3]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[n+ 4] - piCur[n+ 4]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[n+ 5] - piCur[n+ 5]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[n+ 6] - piCur[n+ 6]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[n+ 7] - piCur[n+ 7]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[n+ 8] - piCur[n+ 8]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[n+ 9] - piCur[n+ 9]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[n+10] - piCur[n+10]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[n+11] - piCur[n+11]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[n+12] - piCur[n+12]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[n+13] - piCur[n+13]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[n+14] - piCur[n+14]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[n+15] - piCur[n+15]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
piOrg += iStrideOrg;
piCur += iStrideCur;
assert( pcDtParam->iCols == 32 );
const Pel* piOrg = pcDtParam->pOrg;
const Pel* piCur = pcDtParam->pCur;
for( ; iRows != 0; iRows-- )
iTemp = piOrg[ 0] - piCur[ 0]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[ 1] - piCur[ 1]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[ 2] - piCur[ 2]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[ 3] - piCur[ 3]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[ 4] - piCur[ 4]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[ 5] - piCur[ 5]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[ 6] - piCur[ 6]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[ 7] - piCur[ 7]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[ 8] - piCur[ 8]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[ 9] - piCur[ 9]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[10] - piCur[10]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[11] - piCur[11]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[12] - piCur[12]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[13] - piCur[13]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[14] - piCur[14]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[15] - piCur[15]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[16] - piCur[16]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[17] - piCur[17]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[18] - piCur[18]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[19] - piCur[19]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[20] - piCur[20]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[21] - piCur[21]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[22] - piCur[22]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[23] - piCur[23]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[24] - piCur[24]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[25] - piCur[25]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[26] - piCur[26]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[27] - piCur[27]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[28] - piCur[28]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[29] - piCur[29]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[30] - piCur[30]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[31] - piCur[31]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
piOrg += iStrideOrg;
piCur += iStrideCur;
assert( pcDtParam->iCols == 64 );
const Pel* piOrg = pcDtParam->pOrg;
const Pel* piCur = pcDtParam->pCur;
for( ; iRows != 0; iRows-- )
iTemp = piOrg[ 0] - piCur[ 0]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[ 1] - piCur[ 1]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[ 2] - piCur[ 2]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[ 3] - piCur[ 3]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[ 4] - piCur[ 4]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[ 5] - piCur[ 5]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[ 6] - piCur[ 6]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[ 7] - piCur[ 7]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[ 8] - piCur[ 8]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[ 9] - piCur[ 9]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[10] - piCur[10]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[11] - piCur[11]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[12] - piCur[12]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[13] - piCur[13]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[14] - piCur[14]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[15] - piCur[15]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[16] - piCur[16]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[17] - piCur[17]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[18] - piCur[18]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[19] - piCur[19]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[20] - piCur[20]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[21] - piCur[21]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[22] - piCur[22]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[23] - piCur[23]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[24] - piCur[24]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[25] - piCur[25]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[26] - piCur[26]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[27] - piCur[27]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[28] - piCur[28]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[29] - piCur[29]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[30] - piCur[30]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[31] - piCur[31]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[32] - piCur[32]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[33] - piCur[33]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[34] - piCur[34]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[35] - piCur[35]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[36] - piCur[36]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[37] - piCur[37]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[38] - piCur[38]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[39] - piCur[39]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[40] - piCur[40]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[41] - piCur[41]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[42] - piCur[42]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[43] - piCur[43]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[44] - piCur[44]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[45] - piCur[45]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[46] - piCur[46]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[47] - piCur[47]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[48] - piCur[48]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[49] - piCur[49]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[50] - piCur[50]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[51] - piCur[51]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[52] - piCur[52]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[53] - piCur[53]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[54] - piCur[54]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[55] - piCur[55]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[56] - piCur[56]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[57] - piCur[57]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[58] - piCur[58]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[59] - piCur[59]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[60] - piCur[60]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[61] - piCur[61]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[62] - piCur[62]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
iTemp = piOrg[63] - piCur[63]; uiSum += Distortion(( iTemp * iTemp ) >> uiShift);
piOrg += iStrideOrg;
piCur += iStrideCur;
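// xCalcHADs2x2(): 2x2 Hadamard SATD — one butterfly stage over the four pixel
// differences, then the sum of the absolute transformed values.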
assert( iStep == 1 );
diff[0] = piOrg[0 ] - piCur[0];
diff[1] = piOrg[1 ] - piCur[1];
diff[2] = piOrg[iStrideOrg ] - piCur[0 + iStrideCur];
diff[3] = piOrg[iStrideOrg + 1] - piCur[1 + iStrideCur];
m[0] = diff[0] + diff[2];
m[1] = diff[1] + diff[3];
m[2] = diff[0] - diff[2];
m[3] = diff[1] - diff[3];
satd += abs(m[0] + m[1]);
satd += abs(m[0] - m[1]);
satd += abs(m[2] + m[3]);
satd += abs(m[2] - m[3]);
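// xCalcHADs4x4(): 4x4 Hadamard SATD. diff[] holds the 4x4 residual in raster
// order; two vertical and two horizontal butterfly stages produce the
// transformed coefficients in d[], whose absolute values are accumulated into
// satd, and the total is normalised by (satd + 1) >> 1.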
TCoeff diff[16], m[16], d[16];
assert( iStep == 1 );
for( k = 0; k < 16; k+=4 )
diff[k+0] = piOrg[0] - piCur[0];
diff[k+1] = piOrg[1] - piCur[1];
diff[k+2] = piOrg[2] - piCur[2];
diff[k+3] = piOrg[3] - piCur[3];
piCur += iStrideCur;
piOrg += iStrideOrg;
m[ 0] = diff[ 0] + diff[12];
m[ 1] = diff[ 1] + diff[13];
m[ 2] = diff[ 2] + diff[14];
m[ 3] = diff[ 3] + diff[15];
m[ 4] = diff[ 4] + diff[ 8];
m[ 5] = diff[ 5] + diff[ 9];
m[ 6] = diff[ 6] + diff[10];
m[ 7] = diff[ 7] + diff[11];
m[ 8] = diff[ 4] - diff[ 8];
m[ 9] = diff[ 5] - diff[ 9];
m[10] = diff[ 6] - diff[10];
m[11] = diff[ 7] - diff[11];
m[12] = diff[ 0] - diff[12];
m[13] = diff[ 1] - diff[13];
m[14] = diff[ 2] - diff[14];
m[15] = diff[ 3] - diff[15];
d[ 0] = m[ 0] + m[ 4];
d[ 1] = m[ 1] + m[ 5];
d[ 2] = m[ 2] + m[ 6];
d[ 3] = m[ 3] + m[ 7];
d[ 4] = m[ 8] + m[12];
d[ 5] = m[ 9] + m[13];
d[ 6] = m[10] + m[14];
d[ 7] = m[11] + m[15];
d[ 8] = m[ 0] - m[ 4];
d[ 9] = m[ 1] - m[ 5];
d[10] = m[ 2] - m[ 6];
d[11] = m[ 3] - m[ 7];
d[12] = m[12] - m[ 8];
d[13] = m[13] - m[ 9];
d[14] = m[14] - m[10];
d[15] = m[15] - m[11];
m[ 0] = d[ 0] + d[ 3];
m[ 1] = d[ 1] + d[ 2];
m[ 2] = d[ 1] - d[ 2];
m[ 3] = d[ 0] - d[ 3];
m[ 4] = d[ 4] + d[ 7];
m[ 5] = d[ 5] + d[ 6];
m[ 6] = d[ 5] - d[ 6];
m[ 7] = d[ 4] - d[ 7];
m[ 8] = d[ 8] + d[11];
m[ 9] = d[ 9] + d[10];
m[10] = d[ 9] - d[10];
m[11] = d[ 8] - d[11];
m[12] = d[12] + d[15];
m[13] = d[13] + d[14];
m[14] = d[13] - d[14];
m[15] = d[12] - d[15];
d[ 0] = m[ 0] + m[ 1];
d[ 1] = m[ 0] - m[ 1];
d[ 2] = m[ 2] + m[ 3];
d[ 3] = m[ 3] - m[ 2];
d[ 4] = m[ 4] + m[ 5];
d[ 5] = m[ 4] - m[ 5];
d[ 6] = m[ 6] + m[ 7];
d[ 7] = m[ 7] - m[ 6];
d[ 8] = m[ 8] + m[ 9];
d[ 9] = m[ 8] - m[ 9];
d[10] = m[10] + m[11];
d[11] = m[11] - m[10];
d[12] = m[12] + m[13];
d[13] = m[12] - m[13];
d[14] = m[14] + m[15];
d[15] = m[15] - m[14];
for (k=0; k<16; ++k)
satd = ((satd+1)>>1);
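// xCalcHADs8x8(): 8x8 Hadamard SATD. For bit depths up to 10 the SSE2 routine
// simdHADs8x8() is used; the scalar fallback builds the 8x8 residual, applies
// the three-stage butterfly horizontally per row and then vertically per
// column, and sums the absolute transformed coefficients.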
#if VECTOR_CODING__DISTORTION_CALCULATIONS && (RExt__HIGH_BIT_DEPTH_SUPPORT==0)
if( bitDepth <= 10 )
return( simdHADs8x8( piOrg , piCur , iStrideOrg , iStrideCur ) );
TCoeff diff[64], m1[8][8], m2[8][8], m3[8][8];
assert( iStep == 1 );
for( k = 0; k < 64; k += 8 )
diff[k+0] = piOrg[0] - piCur[0];
diff[k+1] = piOrg[1] - piCur[1];
diff[k+2] = piOrg[2] - piCur[2];
diff[k+3] = piOrg[3] - piCur[3];
diff[k+4] = piOrg[4] - piCur[4];
diff[k+5] = piOrg[5] - piCur[5];
diff[k+6] = piOrg[6] - piCur[6];
diff[k+7] = piOrg[7] - piCur[7];
piCur += iStrideCur;
piOrg += iStrideOrg;
for (j=0; j < 8; j++)
m2[j][0] = diff[jj ] + diff[jj+4];
m2[j][1] = diff[jj+1] + diff[jj+5];
m2[j][2] = diff[jj+2] + diff[jj+6];
m2[j][3] = diff[jj+3] + diff[jj+7];
m2[j][4] = diff[jj ] - diff[jj+4];
m2[j][5] = diff[jj+1] - diff[jj+5];
m2[j][6] = diff[jj+2] - diff[jj+6];
m2[j][7] = diff[jj+3] - diff[jj+7];
m1[j][0] = m2[j][0] + m2[j][2];
m1[j][1] = m2[j][1] + m2[j][3];
m1[j][2] = m2[j][0] - m2[j][2];
m1[j][3] = m2[j][1] - m2[j][3];
m1[j][4] = m2[j][4] + m2[j][6];
m1[j][5] = m2[j][5] + m2[j][7];
m1[j][6] = m2[j][4] - m2[j][6];
m1[j][7] = m2[j][5] - m2[j][7];
m2[j][0] = m1[j][0] + m1[j][1];
m2[j][1] = m1[j][0] - m1[j][1];
m2[j][2] = m1[j][2] + m1[j][3];
m2[j][3] = m1[j][2] - m1[j][3];
m2[j][4] = m1[j][4] + m1[j][5];
m2[j][5] = m1[j][4] - m1[j][5];
m2[j][6] = m1[j][6] + m1[j][7];
m2[j][7] = m1[j][6] - m1[j][7];
for (i=0; i < 8; i++)
m3[0][i] = m2[0][i] + m2[4][i];
m3[1][i] = m2[1][i] + m2[5][i];
m3[2][i] = m2[2][i] + m2[6][i];
m3[3][i] = m2[3][i] + m2[7][i];
m3[4][i] = m2[0][i] - m2[4][i];
m3[5][i] = m2[1][i] - m2[5][i];
m3[6][i] = m2[2][i] - m2[6][i];
m3[7][i] = m2[3][i] - m2[7][i];
m1[0][i] = m3[0][i] + m3[2][i];
m1[1][i] = m3[1][i] + m3[3][i];
m1[2][i] = m3[0][i] - m3[2][i];
m1[3][i] = m3[1][i] - m3[3][i];
m1[4][i] = m3[4][i] + m3[6][i];
m1[5][i] = m3[5][i] + m3[7][i];
m1[6][i] = m3[4][i] - m3[6][i];
m1[7][i] = m3[5][i] - m3[7][i];
m2[0][i] = m1[0][i] + m1[1][i];
m2[1][i] = m1[0][i] - m1[1][i];
m2[2][i] = m1[2][i] + m1[3][i];
m2[3][i] = m1[2][i] - m1[3][i];
m2[4][i] = m1[4][i] + m1[5][i];
m2[5][i] = m1[4][i] - m1[5][i];
m2[6][i] = m1[6][i] + m1[7][i];
m2[7][i] = m1[6][i] - m1[7][i];
for (i = 0; i < 8; i++)
for (j = 0; j < 8; j++)
sad += abs(m2[i][j]);
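// xGetHADs(): general-size Hadamard cost. The block is tiled with the largest
// applicable transform (8x8, then 4x4, then 2x2, depending on which dimensions
// divide evenly) and the per-tile SATDs are accumulated; iStep scales the
// column index into the current (reference) buffer.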
const Pel* piOrg = pcDtParam->pOrg;
const Pel* piCur = pcDtParam->pCur;
const Int iRows = pcDtParam->iRows;
const Int iCols = pcDtParam->iCols;
const Int iStep = pcDtParam->iStep;
if( ( iRows % 8 == 0) && (iCols % 8 == 0) )
Int iOffsetOrg = iStrideOrg<<3;
Int iOffsetCur = iStrideCur<<3;
for ( y=0; y<iRows; y+= 8 )
for ( x=0; x<iCols; x+= 8 )
uiSum += xCalcHADs8x8( &piOrg[x], &piCur[x*iStep], iStrideOrg, iStrideCur, iStep );
piOrg += iOffsetOrg;
piCur += iOffsetCur;
else if( ( iRows % 4 == 0) && (iCols % 4 == 0) )
Int iOffsetOrg = iStrideOrg<<2;
Int iOffsetCur = iStrideCur<<2;
for ( y=0; y<iRows; y+= 4 )
for ( x=0; x<iCols; x+= 4 )
uiSum += xCalcHADs4x4( &piOrg[x], &piCur[x*iStep], iStrideOrg, iStrideCur, iStep );
piOrg += iOffsetOrg;
piCur += iOffsetCur;
else if( ( iRows % 2 == 0) && (iCols % 2 == 0) )
Int iOffsetOrg = iStrideOrg<<1;
Int iOffsetCur = iStrideCur<<1;
for ( y=0; y<iRows; y+=2 )
for ( x=0; x<iCols; x+=2 )
uiSum += xCalcHADs2x2( &piOrg[x], &piCur[x*iStep], iStrideOrg, iStrideCur, iStep );
piOrg += iOffsetOrg;
piCur += iOffsetCur;