49 #if VECTOR_CODING__INTERPOLATION_FILTER && (RExt__HIGH_BIT_DEPTH_SUPPORT==0)
50 #include <emmintrin.h>
62 { 0, 0, 0, 64, 0, 0, 0, 0 },
63 { -1, 4, -10, 58, 17, -5, 1, 0 },
64 { -1, 4, -11, 40, 40, -11, 4, -1 },
65 { 0, 1, -5, 17, 58, -10, 4, -1 }
80 #if VECTOR_CODING__INTERPOLATION_FILTER && (RExt__HIGH_BIT_DEPTH_SUPPORT==0)
// SSE2 helper: accumulates 8 coefficient-weighted 16-bit loads in 32-bit lanes,
// rounds, shifts and packs the result back to signed 16-bit lanes.
//   src      - pointer to the first sample to load (four 16-bit values per load)
//   srcStride- not referenced in the lines visible here; the listing jumps from
//              line 91 to 94, so presumably src is advanced by srcStride once per
//              tap on an omitted line - TODO confirm against the full file
//   mmCoeff  - array of at least 8 vectors, one per tap (indexed 0..7 below)
//   mmOffset - 32-bit rounding offset added to every sum before the shift
//   shift    - arithmetic right-shift applied to every 32-bit sum
// Returns the packed vector: sumLo in lanes 0-3, sumHi in lanes 4-7.
81 inline __m128i simdInterpolateLuma4(
Short const *src ,
Int srcStride , __m128i *mmCoeff ,
const __m128i & mmOffset ,
Int shift )
83 __m128i sumHi = _mm_setzero_si128();
84 __m128i sumLo = _mm_setzero_si128();
// One iteration per filter tap.
85 for(
Int n = 0 ; n < 8 ; n++ )
// 64-bit load: lanes 0-3 hold samples, lanes 4-7 are zero.
87 __m128i mmPix = _mm_loadl_epi64( ( __m128i* )src );
// mulhi/mullo give the upper and lower 16 bits of each signed 16x16 product.
88 __m128i hi = _mm_mulhi_epi16( mmPix , mmCoeff[n] );
89 __m128i lo = _mm_mullo_epi16( mmPix , mmCoeff[n] );
// Interleaving lo with hi rebuilds the full 32-bit products
// (unpackhi -> lanes 4-7, unpacklo -> lanes 0-3) before accumulation.
90 sumHi = _mm_add_epi32( sumHi , _mm_unpackhi_epi16( lo , hi ) );
91 sumLo = _mm_add_epi32( sumLo , _mm_unpacklo_epi16( lo , hi ) );
// Round and scale: (sum + offset) >> shift, arithmetic shift.
94 sumHi = _mm_srai_epi32( _mm_add_epi32( sumHi , mmOffset ) , shift );
95 sumLo = _mm_srai_epi32( _mm_add_epi32( sumLo , mmOffset ) , shift );
// Pack to 16 bits with signed saturation. Because the load zeroes lanes 4-7,
// only the low four output lanes carry filtered samples.
96 return( _mm_packs_epi32( sumLo , sumHi ) );
// SSE2 helper: same accumulate/round/shift/pack sequence as the 8-tap 4-sample
// variant, but with 4 taps (mmCoeff[0..3]).
//   src      - pointer to the first sample to load (four 16-bit values per load)
//   srcStride- not referenced in the lines visible here; the listing jumps from
//              line 109 to 112, so presumably src is advanced per tap on an
//              omitted line - TODO confirm against the full file
//   mmCoeff  - array of at least 4 coefficient vectors
//   mmOffset - 32-bit rounding offset added before the shift
//   shift    - arithmetic right-shift applied to every 32-bit sum
// Returns the packed vector: sumLo in lanes 0-3, sumHi in lanes 4-7.
99 inline __m128i simdInterpolateChroma4(
Short const *src ,
Int srcStride , __m128i *mmCoeff ,
const __m128i & mmOffset ,
Int shift )
101 __m128i sumHi = _mm_setzero_si128();
102 __m128i sumLo = _mm_setzero_si128();
// One iteration per filter tap.
103 for(
Int n = 0 ; n < 4 ; n++ )
// 64-bit load: lanes 0-3 hold samples, lanes 4-7 are zero.
105 __m128i mmPix = _mm_loadl_epi64( ( __m128i* )src );
// Upper and lower halves of each signed 16x16 product.
106 __m128i hi = _mm_mulhi_epi16( mmPix , mmCoeff[n] );
107 __m128i lo = _mm_mullo_epi16( mmPix , mmCoeff[n] );
// Rebuild 32-bit products by interleaving lo/hi, then accumulate.
108 sumHi = _mm_add_epi32( sumHi , _mm_unpackhi_epi16( lo , hi ) );
109 sumLo = _mm_add_epi32( sumLo , _mm_unpacklo_epi16( lo , hi ) );
// Round and scale: (sum + offset) >> shift, arithmetic shift.
112 sumHi = _mm_srai_epi32( _mm_add_epi32( sumHi , mmOffset ) , shift );
113 sumLo = _mm_srai_epi32( _mm_add_epi32( sumLo , mmOffset ) , shift );
// Pack to 16 bits with signed saturation; only lanes 0-3 carry filtered samples.
114 return( _mm_packs_epi32( sumLo , sumHi ) );
// SSE2 helper: 8-tap accumulate/round/shift/pack over eight 16-bit samples per
// load (full 128-bit unaligned load).
//   src      - pointer to the first sample to load (eight 16-bit values per load)
//   srcStride- not referenced in the lines visible here; the listing jumps from
//              line 127 to 130, so presumably src is advanced per tap on an
//              omitted line - TODO confirm against the full file
//   mmCoeff  - array of at least 8 coefficient vectors
//   mmOffset - 32-bit rounding offset added before the shift
//   shift    - arithmetic right-shift applied to every 32-bit sum
// Returns eight filtered values packed to signed 16-bit lanes.
117 inline __m128i simdInterpolateLuma8(
Short const *src ,
Int srcStride , __m128i *mmCoeff ,
const __m128i & mmOffset ,
Int shift )
119 __m128i sumHi = _mm_setzero_si128();
120 __m128i sumLo = _mm_setzero_si128();
// One iteration per filter tap.
121 for(
Int n = 0 ; n < 8 ; n++ )
// Unaligned 128-bit load: all eight lanes hold samples.
123 __m128i mmPix = _mm_loadu_si128( ( __m128i* )src );
// Upper and lower halves of each signed 16x16 product.
124 __m128i hi = _mm_mulhi_epi16( mmPix , mmCoeff[n] );
125 __m128i lo = _mm_mullo_epi16( mmPix , mmCoeff[n] );
// Rebuild 32-bit products: unpackhi covers lanes 4-7, unpacklo lanes 0-3.
126 sumHi = _mm_add_epi32( sumHi , _mm_unpackhi_epi16( lo , hi ) );
127 sumLo = _mm_add_epi32( sumLo , _mm_unpacklo_epi16( lo , hi ) );
// Round and scale: (sum + offset) >> shift, arithmetic shift.
130 sumHi = _mm_srai_epi32( _mm_add_epi32( sumHi , mmOffset ) , shift );
131 sumLo = _mm_srai_epi32( _mm_add_epi32( sumLo , mmOffset ) , shift );
// Pack both halves to 16 bits with signed saturation (sumLo first).
132 return( _mm_packs_epi32( sumLo , sumHi ) );
// SSE2 helper: 2-tap accumulate/round/shift/pack over eight 16-bit samples per
// load (full 128-bit unaligned load).
//   src      - pointer to the first sample to load (eight 16-bit values per load)
//   srcStride- not referenced in the lines visible here; the listing jumps from
//              line 145 to 148, so presumably src is advanced per tap on an
//              omitted line - TODO confirm against the full file
//   mmCoeff  - array of at least 2 coefficient vectors
//   mmOffset - 32-bit rounding offset added before the shift
//   shift    - arithmetic right-shift applied to every 32-bit sum
// Returns eight filtered values packed to signed 16-bit lanes.
135 inline __m128i simdInterpolateLuma2P8(
Short const *src ,
Int srcStride , __m128i *mmCoeff ,
const __m128i & mmOffset ,
Int shift )
137 __m128i sumHi = _mm_setzero_si128();
138 __m128i sumLo = _mm_setzero_si128();
// One iteration per filter tap.
139 for(
Int n = 0 ; n < 2 ; n++ )
// Unaligned 128-bit load: all eight lanes hold samples.
141 __m128i mmPix = _mm_loadu_si128( ( __m128i* )src );
// Upper and lower halves of each signed 16x16 product.
142 __m128i hi = _mm_mulhi_epi16( mmPix , mmCoeff[n] );
143 __m128i lo = _mm_mullo_epi16( mmPix , mmCoeff[n] );
// Rebuild 32-bit products: unpackhi covers lanes 4-7, unpacklo lanes 0-3.
144 sumHi = _mm_add_epi32( sumHi , _mm_unpackhi_epi16( lo , hi ) );
145 sumLo = _mm_add_epi32( sumLo , _mm_unpacklo_epi16( lo , hi ) );
// Round and scale: (sum + offset) >> shift, arithmetic shift.
148 sumHi = _mm_srai_epi32( _mm_add_epi32( sumHi , mmOffset ) , shift );
149 sumLo = _mm_srai_epi32( _mm_add_epi32( sumLo , mmOffset ) , shift );
// Pack both halves to 16 bits with signed saturation (sumLo first).
150 return( _mm_packs_epi32( sumLo , sumHi ) );
// SSE2 helper: 2-tap accumulate/round/shift/pack over four 16-bit samples per
// load (64-bit load).
//   src      - pointer to the first sample to load (four 16-bit values per load)
//   srcStride- not referenced in the lines visible here; the listing jumps from
//              line 163 to 166, so presumably src is advanced per tap on an
//              omitted line - TODO confirm against the full file
//   mmCoeff  - array of at least 2 coefficient vectors
//   mmOffset - 32-bit rounding offset added before the shift
//   shift    - arithmetic right-shift applied to every 32-bit sum
// Returns the packed vector: sumLo in lanes 0-3, sumHi in lanes 4-7.
153 inline __m128i simdInterpolateLuma2P4(
Short const *src ,
Int srcStride , __m128i *mmCoeff ,
const __m128i & mmOffset ,
Int shift )
155 __m128i sumHi = _mm_setzero_si128();
156 __m128i sumLo = _mm_setzero_si128();
// One iteration per filter tap.
157 for(
Int n = 0 ; n < 2 ; n++ )
// 64-bit load: lanes 0-3 hold samples, lanes 4-7 are zero.
159 __m128i mmPix = _mm_loadl_epi64( ( __m128i* )src );
// Upper and lower halves of each signed 16x16 product.
160 __m128i hi = _mm_mulhi_epi16( mmPix , mmCoeff[n] );
161 __m128i lo = _mm_mullo_epi16( mmPix , mmCoeff[n] );
// Rebuild 32-bit products by interleaving lo/hi, then accumulate.
162 sumHi = _mm_add_epi32( sumHi , _mm_unpackhi_epi16( lo , hi ) );
163 sumLo = _mm_add_epi32( sumLo , _mm_unpacklo_epi16( lo , hi ) );
// Round and scale: (sum + offset) >> shift, arithmetic shift.
166 sumHi = _mm_srai_epi32( _mm_add_epi32( sumHi , mmOffset ) , shift );
167 sumLo = _mm_srai_epi32( _mm_add_epi32( sumLo , mmOffset ) , shift );
// Pack to 16 bits with signed saturation; only lanes 0-3 carry filtered samples.
168 return( _mm_packs_epi32( sumLo , sumHi ) );
// Clamps each signed 16-bit lane of mmPix to the range [mmMin, mmMax] using
// compare masks and bitwise select.
//   mmMin - per-lane lower bound
//   mmMax - per-lane upper bound
//   mmPix - values to clamp
// NOTE(review): the return statement is on a line this listing omits (177);
// presumably it returns mmPix - confirm against the full file.
171 inline __m128i simdClip3( __m128i mmMin , __m128i mmMax , __m128i mmPix )
// Mask is all-ones in lanes where mmPix > mmMin (signed compare).
173 __m128i mmMask = _mm_cmpgt_epi16( mmPix , mmMin );
// Keep mmPix where the mask is set, otherwise substitute mmMin.
174 mmPix = _mm_or_si128( _mm_and_si128( mmMask , mmPix ) , _mm_andnot_si128( mmMask , mmMin ) );
// Mask is all-ones in lanes where the lower-clamped value is < mmMax.
175 mmMask = _mm_cmplt_epi16( mmPix , mmMax );
// Keep mmPix where the mask is set, otherwise substitute mmMax.
176 mmPix = _mm_or_si128( _mm_and_si128( mmMask , mmPix ) , _mm_andnot_si128( mmMask , mmMax ) );
202 if ( isFirst == isLast )
204 for (row = 0; row < height; row++)
206 for (col = 0; col < width; col++)
219 for (row = 0; row < height; row++)
221 for (col = 0; col < width; col++)
235 Pel maxVal = (1 << bitDepth) - 1;
237 for (row = 0; row < height; row++)
239 for (col = 0; col < width; col++)
241 Pel val = src[ col ];
// N-tap FIR filter over a width x height block, specialised at compile time on
// tap count (N), direction (isVertical) and pipeline stage (isFirst, isLast).
// NOTE(review): this listing shows only selected numbered lines; the function
// signature, braces, local declarations (c, shift, offset, minVal, maxVal,
// headRoom, mmCoeff, row, col, sum) and several guarding conditions are on
// omitted lines. Comments below describe only what the visible lines do.
276 template<Int N, Bool isVertical, Bool isFirst, Bool isLast>
// Distance between consecutive taps: one row for vertical, one sample for horizontal.
300 Int cStride = ( isVertical ) ? srcStride : 1;
// Move src back by (N/2 - 1) taps so the tap window starts before the current sample.
301 src -= ( N/2 - 1 ) * cStride;
// Shift/offset/maxVal setup. The if/else that selects between the two shift
// adjustments is on omitted lines - presumably keyed on isLast; confirm.
313 shift += (isFirst) ? 0 : headRoom;
// Rounding offset: half of the final divisor.
314 offset = 1 << (shift - 1);
// Largest sample value representable at bitDepth bits.
316 maxVal = (1 << bitDepth) - 1;
320 shift -= (isFirst) ? headRoom : 0;
// SSE2 paths, compiled only when vector coding is enabled and high bit depth
// support is off.
325 #if VECTOR_CODING__INTERPOLATION_FILTER && (RExt__HIGH_BIT_DEPTH_SUPPORT==0)
// 8 taps, width a multiple of 8: eight outputs per step.
328 if( N == 8 && !( width & 0x07 ) )
// Broadcast the scalar rounding offset and clip bounds to every lane.
331 __m128i mmOffset = _mm_set1_epi32( offset );
333 __m128i mmMin = _mm_set1_epi16( minVal );
334 __m128i mmMax = _mm_set1_epi16( maxVal );
// Broadcast each tap coefficient to all eight 16-bit lanes.
335 for(
Int n = 0 ; n < 8 ; n++ )
336 mmCoeff[n] = _mm_set1_epi16( c[n] );
337 for( row = 0 ; row < height ; row++ )
339 for( col = 0 ; col < width ; col += 8 )
// cStride is passed as the helper's srcStride argument.
341 __m128i mmFiltered = simdInterpolateLuma8( src + col , cStride , mmCoeff , mmOffset , shift );
// NOTE(review): lines 342-343 are omitted; the clip may be conditional - confirm.
344 mmFiltered = simdClip3( mmMin , mmMax , mmFiltered );
// Unaligned store of all eight results.
346 _mm_storeu_si128( ( __m128i * )( dst + col ) , mmFiltered );
// 8 taps, width a multiple of 4 (but not 8): four outputs per step.
353 else if( N == 8 && !( width & 0x03 ) )
356 __m128i mmOffset = _mm_set1_epi32( offset );
358 __m128i mmMin = _mm_set1_epi16( minVal );
359 __m128i mmMax = _mm_set1_epi16( maxVal );
// Broadcast each tap coefficient to all eight 16-bit lanes.
360 for(
Int n = 0 ; n < 8 ; n++ )
361 mmCoeff[n] = _mm_set1_epi16( c[n] );
362 for( row = 0 ; row < height ; row++ )
364 for( col = 0 ; col < width ; col += 4 )
366 __m128i mmFiltered = simdInterpolateLuma4( src + col , cStride , mmCoeff , mmOffset , shift );
// NOTE(review): lines 367-368 are omitted; the clip may be conditional - confirm.
369 mmFiltered = simdClip3( mmMin , mmMax , mmFiltered );
// Store only the low 64 bits (four results).
371 _mm_storel_epi64( ( __m128i * )( dst + col ) , mmFiltered );
// 4 taps, width a multiple of 4: four outputs per step.
378 else if( N == 4 && !( width & 0x03 ) )
381 __m128i mmOffset = _mm_set1_epi32( offset );
383 __m128i mmMin = _mm_set1_epi16( minVal );
384 __m128i mmMax = _mm_set1_epi16( maxVal );
// Broadcast each of the four tap coefficients.
385 for(
Int n = 0 ; n < 4 ; n++ )
386 mmCoeff[n] = _mm_set1_epi16( c[n] );
387 for( row = 0 ; row < height ; row++ )
389 for( col = 0 ; col < width ; col += 4 )
391 __m128i mmFiltered = simdInterpolateChroma4( src + col , cStride , mmCoeff , mmOffset , shift );
// NOTE(review): lines 392-393 are omitted; the clip may be conditional - confirm.
394 mmFiltered = simdClip3( mmMin , mmMax , mmFiltered );
// Store only the low 64 bits (four results).
396 _mm_storel_epi64( ( __m128i * )( dst + col ) , mmFiltered );
// 2 taps, width a multiple of 8: eight outputs per step.
403 else if( N == 2 && !( width & 0x07 ) )
406 __m128i mmOffset = _mm_set1_epi32( offset );
408 __m128i mmMin = _mm_set1_epi16( minVal );
409 __m128i mmMax = _mm_set1_epi16( maxVal );
// Broadcast both tap coefficients.
410 for(
Int n = 0 ; n < 2 ; n++ )
411 mmCoeff[n] = _mm_set1_epi16( c[n] );
412 for( row = 0 ; row < height ; row++ )
414 for( col = 0 ; col < width ; col += 8 )
416 __m128i mmFiltered = simdInterpolateLuma2P8( src + col , cStride , mmCoeff , mmOffset , shift );
// NOTE(review): lines 417-418 are omitted; the clip may be conditional - confirm.
419 mmFiltered = simdClip3( mmMin , mmMax , mmFiltered );
// Unaligned store of all eight results.
421 _mm_storeu_si128( ( __m128i * )( dst + col ) , mmFiltered );
// 2 taps, width a multiple of 4 (but not 8): four outputs per step.
428 else if( N == 2 && !( width & 0x03 ) )
431 __m128i mmOffset = _mm_set1_epi32( offset );
433 __m128i mmMin = _mm_set1_epi16( minVal );
434 __m128i mmMax = _mm_set1_epi16( maxVal );
// Broadcast both tap coefficients.
435 for(
Int n = 0 ; n < 2 ; n++ )
436 mmCoeff[n] = _mm_set1_epi16( c[n] );
437 for( row = 0 ; row < height ; row++ )
439 for( col = 0 ; col < width ; col += 4 )
441 __m128i mmFiltered = simdInterpolateLuma2P4( src + col , cStride , mmCoeff , mmOffset , shift );
// NOTE(review): lines 442-443 are omitted; the clip may be conditional - confirm.
444 mmFiltered = simdClip3( mmMin , mmMax , mmFiltered );
// Store only the low 64 bits (four results).
446 _mm_storel_epi64( ( __m128i * )( dst + col ) , mmFiltered );
// Scalar path: one output sample per inner iteration.
456 for (row = 0; row < height; row++)
458 for (col = 0; col < width; col++)
// Weighted sum of taps spaced cStride apart. The listing numbers jump between
// the tap pairs (463->466, 467->471, 472->476), so the later pairs are
// presumably guarded by N on omitted lines - confirm.
462 sum = src[ col + 0 * cStride] * c[0];
463 sum += src[ col + 1 * cStride] * c[1];
466 sum += src[ col + 2 * cStride] * c[2];
467 sum += src[ col + 3 * cStride] * c[3];
471 sum += src[ col + 4 * cStride] * c[4];
472 sum += src[ col + 5 * cStride] * c[5];
476 sum += src[ col + 6 * cStride] * c[6];
477 sum += src[ col + 7 * cStride] * c[7];
// Round and scale the accumulated sum.
480 Pel val = ( sum + offset ) >> shift;
// Clamp to [0, maxVal]. NOTE(review): lines 481-482 are omitted; the clamp may
// be conditional - confirm.
483 val = ( val < 0 ) ? 0 : val;
484 val = ( val > maxVal ) ? maxVal : val;
// Horizontal dispatch (signature and branch conditions are on omitted lines).
// Both visible calls instantiate filter with isVertical = false and
// isFirst = true; they differ only in the compile-time isLast argument.
513 filter<N, false, true, true>(bitDepth, src, srcStride, dst, dstStride, width, height, coeff);
517 filter<N, false, true, false>(bitDepth, src, srcStride, dst, dstStride, width, height, coeff);
// Vertical dispatch (signature is on omitted lines): maps the runtime
// isFirst/isLast flags onto the four compile-time instantiations of
// filter<N, true, isFirst, isLast>.
539 if ( isFirst && isLast )
541 filter<N, true, true, true>(bitDepth, src, srcStride, dst, dstStride, width, height, coeff);
543 else if ( isFirst && !isLast )
545 filter<N, true, true, false>(bitDepth, src, srcStride, dst, dstStride, width, height, coeff);
547 else if ( !isFirst && isLast )
549 filter<N, true, false, true>(bitDepth, src, srcStride, dst, dstStride, width, height, coeff);
// Remaining case; its `else` line (551) is omitted from the listing, so this is
// presumably !isFirst && !isLast - confirm.
553 filter<N, true, false, false>(bitDepth, src, srcStride, dst, dstStride, width, height, coeff);
// Public horizontal entry point: forwards the block either to filterCopy or to
// the luma/chroma tap-count instantiation of filterHor<N>.
// NOTE(review): the conditions choosing between the three calls are on lines
// this listing omits - presumably frac == 0 selects the copy and compID selects
// luma vs. chroma; confirm against the full file.
576 Void TComInterpolationFilter::filterHor(
const ComponentID compID,
Pel *src,
Int srcStride,
Pel *dst,
Int dstStride,
Int width,
Int height,
Int frac,
Bool isLast,
const ChromaFormat fmt,
const Int bitDepth )
// Copy path: isFirst is passed as the literal true.
580 filterCopy(bitDepth, src, srcStride, dst, dstStride, width, height,
true, isLast );
// Luma path: coefficients are the m_lumaFilter row selected by frac.
585 filterHor<NTAPS_LUMA>(bitDepth, src, srcStride, dst, dstStride, width, height, isLast,
m_lumaFilter[frac]);
// Chroma path: the table row is frac << (1 - csx). csx is defined on an omitted
// line - presumably the horizontal chroma subsampling shift for fmt; confirm.
591 filterHor<NTAPS_CHROMA>(bitDepth, src, srcStride, dst, dstStride, width, height, isLast,
m_chromaFilter[frac<<(1-csx)]);
// Public vertical entry point: forwards the block either to filterCopy or to
// the luma/chroma tap-count instantiation of filterVer<N>.
// NOTE(review): the conditions choosing between the three calls are on lines
// this listing omits - presumably frac == 0 selects the copy and compID selects
// luma vs. chroma; confirm against the full file.
612 Void TComInterpolationFilter::filterVer(
const ComponentID compID,
Pel *src,
Int srcStride,
Pel *dst,
Int dstStride,
Int width,
Int height,
Int frac,
Bool isFirst,
Bool isLast,
const ChromaFormat fmt,
const Int bitDepth )
// Copy path: both stage flags are forwarded unchanged.
616 filterCopy(bitDepth, src, srcStride, dst, dstStride, width, height, isFirst, isLast );
// Luma path: coefficients are the m_lumaFilter row selected by frac.
621 filterVer<NTAPS_LUMA>(bitDepth, src, srcStride, dst, dstStride, width, height, isFirst, isLast,
m_lumaFilter[frac]);
// Chroma path: the table row is frac << (1 - csy). csy is defined on an omitted
// line - presumably the vertical chroma subsampling shift for fmt; confirm.
627 filterVer<NTAPS_CHROMA>(bitDepth, src, srcStride, dst, dstStride, width, height, isFirst, isLast,
m_chromaFilter[frac<<(1-csy)]);
#define IF_FILTER_PREC
Log2 of sum of filter taps.
static Void filterCopy(Int bitDepth, const Pel *src, Int srcStride, Pel *dst, Int dstStride, Int width, Int height, Bool isFirst, Bool isLast)
Apply unit FIR filter to a block of samples.
Short TFilterCoeff
filter coefficient
static const Int LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS
static const TFilterCoeff m_lumaFilter[LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS][8]
Luma filter taps.
ValueType leftShift_round(const ValueType value, const Int shift)
global variables & functions (header)
Declaration of TComInterpolationFilter class.
#define IF_INTERNAL_OFFS
Offset used internally.
static const Int CHROMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS
#define IF_INTERNAL_PREC
Number of bits for internal precision.
ValueType rightShift_round(const ValueType value, const Int shift)
ChromaFormat
chroma formats (according to semantics of chroma_format_idc)
static Void filterVer(Int bitDepth, Pel *src, Int srcStride, Pel *dst, Int dstStride, Int width, Int height, Bool isFirst, Bool isLast, TFilterCoeff const *coeff)
Filter a block of samples (vertical)
static Void filter(Int bitDepth, Pel const *src, Int srcStride, Pel *dst, Int dstStride, Int width, Int height, TFilterCoeff const *coeff)
Apply FIR filter to a block of samples.
#define NTAPS_LUMA
Number of taps for luma.
#define NTAPS_CHROMA
Number of taps for chroma.
static Void filterHor(Int bitDepth, Pel *src, Int srcStride, Pel *dst, Int dstStride, Int width, Int height, Bool isLast, TFilterCoeff const *coeff)
Filter a block of samples (horizontal)
static const TFilterCoeff m_chromaFilter[CHROMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS][4]
Chroma filter taps.