1 | /* The copyright in this software is being made available under the BSD |
---|
2 | * License, included below. This software may be subject to other third party |
---|
3 | * and contributor rights, including patent rights, and no such rights are |
---|
4 | * granted under this license. |
---|
5 | * |
---|
6 | * Copyright (c) 2010-2017, ITU/ISO/IEC |
---|
7 | * All rights reserved. |
---|
8 | * |
---|
9 | * Redistribution and use in source and binary forms, with or without |
---|
10 | * modification, are permitted provided that the following conditions are met: |
---|
11 | * |
---|
12 | * * Redistributions of source code must retain the above copyright notice, |
---|
13 | * this list of conditions and the following disclaimer. |
---|
14 | * * Redistributions in binary form must reproduce the above copyright notice, |
---|
15 | * this list of conditions and the following disclaimer in the documentation |
---|
16 | * and/or other materials provided with the distribution. |
---|
17 | * * Neither the name of the ITU/ISO/IEC nor the names of its contributors may |
---|
18 | * be used to endorse or promote products derived from this software without |
---|
19 | * specific prior written permission. |
---|
20 | * |
---|
21 | * THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" |
---|
22 | * AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE |
---|
23 | * IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE |
---|
24 | * ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS |
---|
25 | * BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR |
---|
26 | * CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF |
---|
27 | * SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS |
---|
28 | * INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN |
---|
29 | * CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) |
---|
30 | * ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF |
---|
31 | * THE POSSIBILITY OF SUCH DAMAGE. |
---|
32 | */ |
---|
33 | |
---|
34 | /** |
---|
35 | * \file |
---|
36 | * \brief Implementation of TComInterpolationFilter class |
---|
37 | */ |
---|
38 | |
---|
39 | // ==================================================================================================================== |
---|
40 | // Includes |
---|
41 | // ==================================================================================================================== |
---|
42 | |
---|
43 | #include "TComRom.h" |
---|
44 | #include "TComInterpolationFilter.h" |
---|
45 | #include <assert.h> |
---|
46 | |
---|
47 | #include "TComChromaFormat.h" |
---|
48 | |
---|
49 | #if VECTOR_CODING__INTERPOLATION_FILTER && (RExt__HIGH_BIT_DEPTH_SUPPORT==0) |
---|
50 | #include <emmintrin.h> |
---|
51 | #endif |
---|
52 | |
---|
53 | //! \ingroup TLibCommon |
---|
54 | //! \{ |
---|
55 | |
---|
56 | // ==================================================================================================================== |
---|
57 | // Tables |
---|
58 | // ==================================================================================================================== |
---|
59 | |
---|
60 | const TFilterCoeff TComInterpolationFilter::m_lumaFilter[LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS][NTAPS_LUMA] = |
---|
61 | { |
---|
62 | { 0, 0, 0, 64, 0, 0, 0, 0 }, |
---|
63 | { -1, 4, -10, 58, 17, -5, 1, 0 }, |
---|
64 | { -1, 4, -11, 40, 40, -11, 4, -1 }, |
---|
65 | { 0, 1, -5, 17, 58, -10, 4, -1 } |
---|
66 | }; |
---|
67 | |
---|
68 | const TFilterCoeff TComInterpolationFilter::m_chromaFilter[CHROMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS][NTAPS_CHROMA] = |
---|
69 | { |
---|
70 | { 0, 64, 0, 0 }, |
---|
71 | { -2, 58, 10, -2 }, |
---|
72 | { -4, 54, 16, -2 }, |
---|
73 | { -6, 46, 28, -4 }, |
---|
74 | { -4, 36, 36, -4 }, |
---|
75 | { -4, 28, 46, -6 }, |
---|
76 | { -2, 16, 54, -4 }, |
---|
77 | { -2, 10, 58, -2 } |
---|
78 | }; |
---|
79 | |
---|
80 | #if VECTOR_CODING__INTERPOLATION_FILTER && (RExt__HIGH_BIT_DEPTH_SUPPORT==0) |
---|
81 | inline __m128i simdInterpolateLuma4( Short const *src , Int srcStride , __m128i *mmCoeff , const __m128i & mmOffset , Int shift ) |
---|
82 | { |
---|
83 | __m128i sumHi = _mm_setzero_si128(); |
---|
84 | __m128i sumLo = _mm_setzero_si128(); |
---|
85 | for( Int n = 0 ; n < 8 ; n++ ) |
---|
86 | { |
---|
87 | __m128i mmPix = _mm_loadl_epi64( ( __m128i* )src ); |
---|
88 | __m128i hi = _mm_mulhi_epi16( mmPix , mmCoeff[n] ); |
---|
89 | __m128i lo = _mm_mullo_epi16( mmPix , mmCoeff[n] ); |
---|
90 | sumHi = _mm_add_epi32( sumHi , _mm_unpackhi_epi16( lo , hi ) ); |
---|
91 | sumLo = _mm_add_epi32( sumLo , _mm_unpacklo_epi16( lo , hi ) ); |
---|
92 | src += srcStride; |
---|
93 | } |
---|
94 | sumHi = _mm_srai_epi32( _mm_add_epi32( sumHi , mmOffset ) , shift ); |
---|
95 | sumLo = _mm_srai_epi32( _mm_add_epi32( sumLo , mmOffset ) , shift ); |
---|
96 | return( _mm_packs_epi32( sumLo , sumHi ) ); |
---|
97 | } |
---|
98 | |
---|
99 | inline __m128i simdInterpolateChroma4( Short const *src , Int srcStride , __m128i *mmCoeff , const __m128i & mmOffset , Int shift ) |
---|
100 | { |
---|
101 | __m128i sumHi = _mm_setzero_si128(); |
---|
102 | __m128i sumLo = _mm_setzero_si128(); |
---|
103 | for( Int n = 0 ; n < 4 ; n++ ) |
---|
104 | { |
---|
105 | __m128i mmPix = _mm_loadl_epi64( ( __m128i* )src ); |
---|
106 | __m128i hi = _mm_mulhi_epi16( mmPix , mmCoeff[n] ); |
---|
107 | __m128i lo = _mm_mullo_epi16( mmPix , mmCoeff[n] ); |
---|
108 | sumHi = _mm_add_epi32( sumHi , _mm_unpackhi_epi16( lo , hi ) ); |
---|
109 | sumLo = _mm_add_epi32( sumLo , _mm_unpacklo_epi16( lo , hi ) ); |
---|
110 | src += srcStride; |
---|
111 | } |
---|
112 | sumHi = _mm_srai_epi32( _mm_add_epi32( sumHi , mmOffset ) , shift ); |
---|
113 | sumLo = _mm_srai_epi32( _mm_add_epi32( sumLo , mmOffset ) , shift ); |
---|
114 | return( _mm_packs_epi32( sumLo , sumHi ) ); |
---|
115 | } |
---|
116 | |
---|
117 | inline __m128i simdInterpolateLuma8( Short const *src , Int srcStride , __m128i *mmCoeff , const __m128i & mmOffset , Int shift ) |
---|
118 | { |
---|
119 | __m128i sumHi = _mm_setzero_si128(); |
---|
120 | __m128i sumLo = _mm_setzero_si128(); |
---|
121 | for( Int n = 0 ; n < 8 ; n++ ) |
---|
122 | { |
---|
123 | __m128i mmPix = _mm_loadu_si128( ( __m128i* )src ); |
---|
124 | __m128i hi = _mm_mulhi_epi16( mmPix , mmCoeff[n] ); |
---|
125 | __m128i lo = _mm_mullo_epi16( mmPix , mmCoeff[n] ); |
---|
126 | sumHi = _mm_add_epi32( sumHi , _mm_unpackhi_epi16( lo , hi ) ); |
---|
127 | sumLo = _mm_add_epi32( sumLo , _mm_unpacklo_epi16( lo , hi ) ); |
---|
128 | src += srcStride; |
---|
129 | } |
---|
130 | sumHi = _mm_srai_epi32( _mm_add_epi32( sumHi , mmOffset ) , shift ); |
---|
131 | sumLo = _mm_srai_epi32( _mm_add_epi32( sumLo , mmOffset ) , shift ); |
---|
132 | return( _mm_packs_epi32( sumLo , sumHi ) ); |
---|
133 | } |
---|
134 | |
---|
135 | inline __m128i simdInterpolateLuma2P8( Short const *src , Int srcStride , __m128i *mmCoeff , const __m128i & mmOffset , Int shift ) |
---|
136 | { |
---|
137 | __m128i sumHi = _mm_setzero_si128(); |
---|
138 | __m128i sumLo = _mm_setzero_si128(); |
---|
139 | for( Int n = 0 ; n < 2 ; n++ ) |
---|
140 | { |
---|
141 | __m128i mmPix = _mm_loadu_si128( ( __m128i* )src ); |
---|
142 | __m128i hi = _mm_mulhi_epi16( mmPix , mmCoeff[n] ); |
---|
143 | __m128i lo = _mm_mullo_epi16( mmPix , mmCoeff[n] ); |
---|
144 | sumHi = _mm_add_epi32( sumHi , _mm_unpackhi_epi16( lo , hi ) ); |
---|
145 | sumLo = _mm_add_epi32( sumLo , _mm_unpacklo_epi16( lo , hi ) ); |
---|
146 | src += srcStride; |
---|
147 | } |
---|
148 | sumHi = _mm_srai_epi32( _mm_add_epi32( sumHi , mmOffset ) , shift ); |
---|
149 | sumLo = _mm_srai_epi32( _mm_add_epi32( sumLo , mmOffset ) , shift ); |
---|
150 | return( _mm_packs_epi32( sumLo , sumHi ) ); |
---|
151 | } |
---|
152 | |
---|
153 | inline __m128i simdInterpolateLuma2P4( Short const *src , Int srcStride , __m128i *mmCoeff , const __m128i & mmOffset , Int shift ) |
---|
154 | { |
---|
155 | __m128i sumHi = _mm_setzero_si128(); |
---|
156 | __m128i sumLo = _mm_setzero_si128(); |
---|
157 | for( Int n = 0 ; n < 2 ; n++ ) |
---|
158 | { |
---|
159 | __m128i mmPix = _mm_loadl_epi64( ( __m128i* )src ); |
---|
160 | __m128i hi = _mm_mulhi_epi16( mmPix , mmCoeff[n] ); |
---|
161 | __m128i lo = _mm_mullo_epi16( mmPix , mmCoeff[n] ); |
---|
162 | sumHi = _mm_add_epi32( sumHi , _mm_unpackhi_epi16( lo , hi ) ); |
---|
163 | sumLo = _mm_add_epi32( sumLo , _mm_unpacklo_epi16( lo , hi ) ); |
---|
164 | src += srcStride; |
---|
165 | } |
---|
166 | sumHi = _mm_srai_epi32( _mm_add_epi32( sumHi , mmOffset ) , shift ); |
---|
167 | sumLo = _mm_srai_epi32( _mm_add_epi32( sumLo , mmOffset ) , shift ); |
---|
168 | return( _mm_packs_epi32( sumLo , sumHi ) ); |
---|
169 | } |
---|
170 | |
---|
/**
 * \brief Clamp every signed 16-bit lane of mmPix to the range [mmMin, mmMax]
 *
 * \param  mmMin  Per-lane lower bound
 * \param  mmMax  Per-lane upper bound
 * \param  mmPix  Values to clamp
 * \return Lane-wise min( max( mmPix, mmMin ), mmMax )
 */
inline __m128i simdClip3( __m128i mmMin , __m128i mmMax , __m128i mmPix )
{
  // The lower bound is applied first and the upper bound last, so a lane whose
  // bounds are inverted (min > max) ends up at max.
  return( _mm_min_epi16( _mm_max_epi16( mmPix , mmMin ) , mmMax ) );
}
---|
179 | #endif |
---|
180 | |
---|
181 | #if NH_3D |
---|
182 | const Short TComInterpolationFilter::m_lumaFilterARP[4][NTAPS_LUMA_ARP] = |
---|
183 | { |
---|
184 | {64, 0}, |
---|
185 | {48, 16}, |
---|
186 | {32, 32}, |
---|
187 | {16, 48} |
---|
188 | }; |
---|
189 | const Short TComInterpolationFilter::m_chromaFilterARP[8][NTAPS_CHROMA_ARP] = |
---|
190 | { |
---|
191 | {64, 0}, |
---|
192 | {56, 8}, |
---|
193 | {48, 16}, |
---|
194 | {40, 24}, |
---|
195 | {32, 32}, |
---|
196 | {24, 40}, |
---|
197 | {16, 48}, |
---|
198 | {8, 56} |
---|
199 | }; |
---|
200 | #endif |
---|
201 | |
---|
202 | // ==================================================================================================================== |
---|
203 | // Private member functions |
---|
204 | // ==================================================================================================================== |
---|
205 | |
---|
206 | /** |
---|
207 | * \brief Apply unit FIR filter to a block of samples |
---|
208 | * |
---|
209 | * \param bitDepth bitDepth of samples |
---|
210 | * \param src Pointer to source samples |
---|
211 | * \param srcStride Stride of source samples |
---|
212 | * \param dst Pointer to destination samples |
---|
213 | * \param dstStride Stride of destination samples |
---|
214 | * \param width Width of block |
---|
215 | * \param height Height of block |
---|
216 | * \param isFirst Flag indicating whether it is the first filtering operation |
---|
217 | * \param isLast Flag indicating whether it is the last filtering operation |
---|
218 | */ |
---|
219 | Void TComInterpolationFilter::filterCopy(Int bitDepth, const Pel *src, Int srcStride, Pel *dst, Int dstStride, Int width, Int height, Bool isFirst, Bool isLast) |
---|
220 | { |
---|
221 | Int row, col; |
---|
222 | |
---|
223 | if ( isFirst == isLast ) |
---|
224 | { |
---|
225 | for (row = 0; row < height; row++) |
---|
226 | { |
---|
227 | for (col = 0; col < width; col++) |
---|
228 | { |
---|
229 | dst[col] = src[col]; |
---|
230 | } |
---|
231 | |
---|
232 | src += srcStride; |
---|
233 | dst += dstStride; |
---|
234 | } |
---|
235 | } |
---|
236 | else if ( isFirst ) |
---|
237 | { |
---|
238 | const Int shift = std::max<Int>(2, (IF_INTERNAL_PREC - bitDepth)); |
---|
239 | |
---|
240 | for (row = 0; row < height; row++) |
---|
241 | { |
---|
242 | for (col = 0; col < width; col++) |
---|
243 | { |
---|
244 | Pel val = leftShift_round(src[col], shift); |
---|
245 | dst[col] = val - (Pel)IF_INTERNAL_OFFS; |
---|
246 | } |
---|
247 | |
---|
248 | src += srcStride; |
---|
249 | dst += dstStride; |
---|
250 | } |
---|
251 | } |
---|
252 | else |
---|
253 | { |
---|
254 | const Int shift = std::max<Int>(2, (IF_INTERNAL_PREC - bitDepth)); |
---|
255 | |
---|
256 | Pel maxVal = (1 << bitDepth) - 1; |
---|
257 | Pel minVal = 0; |
---|
258 | for (row = 0; row < height; row++) |
---|
259 | { |
---|
260 | for (col = 0; col < width; col++) |
---|
261 | { |
---|
262 | Pel val = src[ col ]; |
---|
263 | val = rightShift_round((val + IF_INTERNAL_OFFS), shift); |
---|
264 | if (val < minVal) |
---|
265 | { |
---|
266 | val = minVal; |
---|
267 | } |
---|
268 | if (val > maxVal) |
---|
269 | { |
---|
270 | val = maxVal; |
---|
271 | } |
---|
272 | dst[col] = val; |
---|
273 | } |
---|
274 | |
---|
275 | src += srcStride; |
---|
276 | dst += dstStride; |
---|
277 | } |
---|
278 | } |
---|
279 | } |
---|
280 | |
---|
281 | /** |
---|
282 | * \brief Apply FIR filter to a block of samples |
---|
283 | * |
---|
284 | * \tparam N Number of taps |
---|
285 | * \tparam isVertical Flag indicating filtering along vertical direction |
---|
286 | * \tparam isFirst Flag indicating whether it is the first filtering operation |
---|
287 | * \tparam isLast Flag indicating whether it is the last filtering operation |
---|
288 | * \param bitDepth Bit depth of samples |
---|
289 | * \param src Pointer to source samples |
---|
290 | * \param srcStride Stride of source samples |
---|
291 | * \param dst Pointer to destination samples |
---|
292 | * \param dstStride Stride of destination samples |
---|
293 | * \param width Width of block |
---|
294 | * \param height Height of block |
---|
295 | * \param coeff Pointer to filter taps |
---|
296 | */ |
---|
297 | template<Int N, Bool isVertical, Bool isFirst, Bool isLast> |
---|
298 | Void TComInterpolationFilter::filter(Int bitDepth, Pel const *src, Int srcStride, Pel *dst, Int dstStride, Int width, Int height, TFilterCoeff const *coeff) |
---|
299 | { |
---|
300 | Int row, col; |
---|
301 | |
---|
302 | Pel c[8]; |
---|
303 | c[0] = coeff[0]; |
---|
304 | c[1] = coeff[1]; |
---|
305 | if ( N >= 4 ) |
---|
306 | { |
---|
307 | c[2] = coeff[2]; |
---|
308 | c[3] = coeff[3]; |
---|
309 | } |
---|
310 | if ( N >= 6 ) |
---|
311 | { |
---|
312 | c[4] = coeff[4]; |
---|
313 | c[5] = coeff[5]; |
---|
314 | } |
---|
315 | if ( N == 8 ) |
---|
316 | { |
---|
317 | c[6] = coeff[6]; |
---|
318 | c[7] = coeff[7]; |
---|
319 | } |
---|
320 | |
---|
321 | Int cStride = ( isVertical ) ? srcStride : 1; |
---|
322 | src -= ( N/2 - 1 ) * cStride; |
---|
323 | |
---|
324 | Int offset; |
---|
325 | Pel maxVal; |
---|
326 | Int headRoom = std::max<Int>(2, (IF_INTERNAL_PREC - bitDepth)); |
---|
327 | Int shift = IF_FILTER_PREC; |
---|
328 | // with the current settings (IF_INTERNAL_PREC = 14 and IF_FILTER_PREC = 6), though headroom can be |
---|
329 | // negative for bit depths greater than 14, shift will remain non-negative for bit depths of 8->20 |
---|
330 | assert(shift >= 0); |
---|
331 | |
---|
332 | if ( isLast ) |
---|
333 | { |
---|
334 | shift += (isFirst) ? 0 : headRoom; |
---|
335 | offset = 1 << (shift - 1); |
---|
336 | offset += (isFirst) ? 0 : IF_INTERNAL_OFFS << IF_FILTER_PREC; |
---|
337 | maxVal = (1 << bitDepth) - 1; |
---|
338 | } |
---|
339 | else |
---|
340 | { |
---|
341 | shift -= (isFirst) ? headRoom : 0; |
---|
342 | offset = (isFirst) ? -IF_INTERNAL_OFFS << shift : 0; |
---|
343 | maxVal = 0; |
---|
344 | } |
---|
345 | |
---|
346 | #if VECTOR_CODING__INTERPOLATION_FILTER && (RExt__HIGH_BIT_DEPTH_SUPPORT==0) |
---|
347 | if( bitDepth <= 10 ) |
---|
348 | { |
---|
349 | if( N == 8 && !( width & 0x07 ) ) |
---|
350 | { |
---|
351 | Short minVal = 0; |
---|
352 | __m128i mmOffset = _mm_set1_epi32( offset ); |
---|
353 | __m128i mmCoeff[8]; |
---|
354 | __m128i mmMin = _mm_set1_epi16( minVal ); |
---|
355 | __m128i mmMax = _mm_set1_epi16( maxVal ); |
---|
356 | for( Int n = 0 ; n < 8 ; n++ ) |
---|
357 | mmCoeff[n] = _mm_set1_epi16( c[n] ); |
---|
358 | for( row = 0 ; row < height ; row++ ) |
---|
359 | { |
---|
360 | for( col = 0 ; col < width ; col += 8 ) |
---|
361 | { |
---|
362 | __m128i mmFiltered = simdInterpolateLuma8( src + col , cStride , mmCoeff , mmOffset , shift ); |
---|
363 | if( isLast ) |
---|
364 | { |
---|
365 | mmFiltered = simdClip3( mmMin , mmMax , mmFiltered ); |
---|
366 | } |
---|
367 | _mm_storeu_si128( ( __m128i * )( dst + col ) , mmFiltered ); |
---|
368 | } |
---|
369 | src += srcStride; |
---|
370 | dst += dstStride; |
---|
371 | } |
---|
372 | return; |
---|
373 | } |
---|
374 | else if( N == 8 && !( width & 0x03 ) ) |
---|
375 | { |
---|
376 | Short minVal = 0; |
---|
377 | __m128i mmOffset = _mm_set1_epi32( offset ); |
---|
378 | __m128i mmCoeff[8]; |
---|
379 | __m128i mmMin = _mm_set1_epi16( minVal ); |
---|
380 | __m128i mmMax = _mm_set1_epi16( maxVal ); |
---|
381 | for( Int n = 0 ; n < 8 ; n++ ) |
---|
382 | mmCoeff[n] = _mm_set1_epi16( c[n] ); |
---|
383 | for( row = 0 ; row < height ; row++ ) |
---|
384 | { |
---|
385 | for( col = 0 ; col < width ; col += 4 ) |
---|
386 | { |
---|
387 | __m128i mmFiltered = simdInterpolateLuma4( src + col , cStride , mmCoeff , mmOffset , shift ); |
---|
388 | if( isLast ) |
---|
389 | { |
---|
390 | mmFiltered = simdClip3( mmMin , mmMax , mmFiltered ); |
---|
391 | } |
---|
392 | _mm_storel_epi64( ( __m128i * )( dst + col ) , mmFiltered ); |
---|
393 | } |
---|
394 | src += srcStride; |
---|
395 | dst += dstStride; |
---|
396 | } |
---|
397 | return; |
---|
398 | } |
---|
399 | else if( N == 4 && !( width & 0x03 ) ) |
---|
400 | { |
---|
401 | Short minVal = 0; |
---|
402 | __m128i mmOffset = _mm_set1_epi32( offset ); |
---|
403 | __m128i mmCoeff[8]; |
---|
404 | __m128i mmMin = _mm_set1_epi16( minVal ); |
---|
405 | __m128i mmMax = _mm_set1_epi16( maxVal ); |
---|
406 | for( Int n = 0 ; n < 4 ; n++ ) |
---|
407 | mmCoeff[n] = _mm_set1_epi16( c[n] ); |
---|
408 | for( row = 0 ; row < height ; row++ ) |
---|
409 | { |
---|
410 | for( col = 0 ; col < width ; col += 4 ) |
---|
411 | { |
---|
412 | __m128i mmFiltered = simdInterpolateChroma4( src + col , cStride , mmCoeff , mmOffset , shift ); |
---|
413 | if( isLast ) |
---|
414 | { |
---|
415 | mmFiltered = simdClip3( mmMin , mmMax , mmFiltered ); |
---|
416 | } |
---|
417 | _mm_storel_epi64( ( __m128i * )( dst + col ) , mmFiltered ); |
---|
418 | } |
---|
419 | src += srcStride; |
---|
420 | dst += dstStride; |
---|
421 | } |
---|
422 | return; |
---|
423 | } |
---|
424 | else if( N == 2 && !( width & 0x07 ) ) |
---|
425 | { |
---|
426 | Short minVal = 0; |
---|
427 | __m128i mmOffset = _mm_set1_epi32( offset ); |
---|
428 | __m128i mmCoeff[2]; |
---|
429 | __m128i mmMin = _mm_set1_epi16( minVal ); |
---|
430 | __m128i mmMax = _mm_set1_epi16( maxVal ); |
---|
431 | for( Int n = 0 ; n < 2 ; n++ ) |
---|
432 | mmCoeff[n] = _mm_set1_epi16( c[n] ); |
---|
433 | for( row = 0 ; row < height ; row++ ) |
---|
434 | { |
---|
435 | for( col = 0 ; col < width ; col += 8 ) |
---|
436 | { |
---|
437 | __m128i mmFiltered = simdInterpolateLuma2P8( src + col , cStride , mmCoeff , mmOffset , shift ); |
---|
438 | if( isLast ) |
---|
439 | { |
---|
440 | mmFiltered = simdClip3( mmMin , mmMax , mmFiltered ); |
---|
441 | } |
---|
442 | _mm_storeu_si128( ( __m128i * )( dst + col ) , mmFiltered ); |
---|
443 | } |
---|
444 | src += srcStride; |
---|
445 | dst += dstStride; |
---|
446 | } |
---|
447 | return; |
---|
448 | } |
---|
449 | else if( N == 2 && !( width & 0x03 ) ) |
---|
450 | { |
---|
451 | Short minVal = 0; |
---|
452 | __m128i mmOffset = _mm_set1_epi32( offset ); |
---|
453 | __m128i mmCoeff[8]; |
---|
454 | __m128i mmMin = _mm_set1_epi16( minVal ); |
---|
455 | __m128i mmMax = _mm_set1_epi16( maxVal ); |
---|
456 | for( Int n = 0 ; n < 2 ; n++ ) |
---|
457 | mmCoeff[n] = _mm_set1_epi16( c[n] ); |
---|
458 | for( row = 0 ; row < height ; row++ ) |
---|
459 | { |
---|
460 | for( col = 0 ; col < width ; col += 4 ) |
---|
461 | { |
---|
462 | __m128i mmFiltered = simdInterpolateLuma2P4( src + col , cStride , mmCoeff , mmOffset , shift ); |
---|
463 | if( isLast ) |
---|
464 | { |
---|
465 | mmFiltered = simdClip3( mmMin , mmMax , mmFiltered ); |
---|
466 | } |
---|
467 | _mm_storel_epi64( ( __m128i * )( dst + col ) , mmFiltered ); |
---|
468 | } |
---|
469 | src += srcStride; |
---|
470 | dst += dstStride; |
---|
471 | } |
---|
472 | return; |
---|
473 | } |
---|
474 | } |
---|
475 | #endif |
---|
476 | |
---|
477 | for (row = 0; row < height; row++) |
---|
478 | { |
---|
479 | for (col = 0; col < width; col++) |
---|
480 | { |
---|
481 | Int sum; |
---|
482 | |
---|
483 | sum = src[ col + 0 * cStride] * c[0]; |
---|
484 | sum += src[ col + 1 * cStride] * c[1]; |
---|
485 | if ( N >= 4 ) |
---|
486 | { |
---|
487 | sum += src[ col + 2 * cStride] * c[2]; |
---|
488 | sum += src[ col + 3 * cStride] * c[3]; |
---|
489 | } |
---|
490 | if ( N >= 6 ) |
---|
491 | { |
---|
492 | sum += src[ col + 4 * cStride] * c[4]; |
---|
493 | sum += src[ col + 5 * cStride] * c[5]; |
---|
494 | } |
---|
495 | if ( N == 8 ) |
---|
496 | { |
---|
497 | sum += src[ col + 6 * cStride] * c[6]; |
---|
498 | sum += src[ col + 7 * cStride] * c[7]; |
---|
499 | } |
---|
500 | |
---|
501 | Pel val = ( sum + offset ) >> shift; |
---|
502 | if ( isLast ) |
---|
503 | { |
---|
504 | val = ( val < 0 ) ? 0 : val; |
---|
505 | val = ( val > maxVal ) ? maxVal : val; |
---|
506 | } |
---|
507 | dst[col] = val; |
---|
508 | } |
---|
509 | |
---|
510 | src += srcStride; |
---|
511 | dst += dstStride; |
---|
512 | } |
---|
513 | } |
---|
514 | |
---|
515 | /** |
---|
516 | * \brief Filter a block of samples (horizontal) |
---|
517 | * |
---|
518 | * \tparam N Number of taps |
---|
519 | * \param bitDepth Bit depth of samples |
---|
520 | * \param src Pointer to source samples |
---|
521 | * \param srcStride Stride of source samples |
---|
522 | * \param dst Pointer to destination samples |
---|
523 | * \param dstStride Stride of destination samples |
---|
524 | * \param width Width of block |
---|
525 | * \param height Height of block |
---|
526 | * \param isLast Flag indicating whether it is the last filtering operation |
---|
527 | * \param coeff Pointer to filter taps |
---|
528 | */ |
---|
529 | template<Int N> |
---|
530 | Void TComInterpolationFilter::filterHor(Int bitDepth, Pel *src, Int srcStride, Pel *dst, Int dstStride, Int width, Int height, Bool isLast, TFilterCoeff const *coeff) |
---|
531 | { |
---|
532 | if ( isLast ) |
---|
533 | { |
---|
534 | filter<N, false, true, true>(bitDepth, src, srcStride, dst, dstStride, width, height, coeff); |
---|
535 | } |
---|
536 | else |
---|
537 | { |
---|
538 | filter<N, false, true, false>(bitDepth, src, srcStride, dst, dstStride, width, height, coeff); |
---|
539 | } |
---|
540 | } |
---|
541 | |
---|
542 | /** |
---|
543 | * \brief Filter a block of samples (vertical) |
---|
544 | * |
---|
545 | * \tparam N Number of taps |
---|
546 | * \param bitDepth Bit depth |
---|
547 | * \param src Pointer to source samples |
---|
548 | * \param srcStride Stride of source samples |
---|
549 | * \param dst Pointer to destination samples |
---|
550 | * \param dstStride Stride of destination samples |
---|
551 | * \param width Width of block |
---|
552 | * \param height Height of block |
---|
553 | * \param isFirst Flag indicating whether it is the first filtering operation |
---|
554 | * \param isLast Flag indicating whether it is the last filtering operation |
---|
555 | * \param coeff Pointer to filter taps |
---|
556 | */ |
---|
557 | template<Int N> |
---|
558 | Void TComInterpolationFilter::filterVer(Int bitDepth, Pel *src, Int srcStride, Pel *dst, Int dstStride, Int width, Int height, Bool isFirst, Bool isLast, TFilterCoeff const *coeff) |
---|
559 | { |
---|
560 | if ( isFirst && isLast ) |
---|
561 | { |
---|
562 | filter<N, true, true, true>(bitDepth, src, srcStride, dst, dstStride, width, height, coeff); |
---|
563 | } |
---|
564 | else if ( isFirst && !isLast ) |
---|
565 | { |
---|
566 | filter<N, true, true, false>(bitDepth, src, srcStride, dst, dstStride, width, height, coeff); |
---|
567 | } |
---|
568 | else if ( !isFirst && isLast ) |
---|
569 | { |
---|
570 | filter<N, true, false, true>(bitDepth, src, srcStride, dst, dstStride, width, height, coeff); |
---|
571 | } |
---|
572 | else |
---|
573 | { |
---|
574 | filter<N, true, false, false>(bitDepth, src, srcStride, dst, dstStride, width, height, coeff); |
---|
575 | } |
---|
576 | } |
---|
577 | |
---|
578 | // ==================================================================================================================== |
---|
579 | // Public member functions |
---|
580 | // ==================================================================================================================== |
---|
581 | |
---|
582 | /** |
---|
583 | * \brief Filter a block of Luma/Chroma samples (horizontal) |
---|
584 | * |
---|
585 | * \param compID Chroma component ID |
---|
586 | * \param src Pointer to source samples |
---|
587 | * \param srcStride Stride of source samples |
---|
588 | * \param dst Pointer to destination samples |
---|
589 | * \param dstStride Stride of destination samples |
---|
590 | * \param width Width of block |
---|
591 | * \param height Height of block |
---|
592 | * \param frac Fractional sample offset |
---|
593 | * \param isLast Flag indicating whether it is the last filtering operation |
---|
594 | * \param fmt Chroma format |
---|
595 | * \param bitDepth Bit depth |
---|
596 | */ |
---|
597 | #if NH_3D |
---|
598 | Void TComInterpolationFilter::filterHor(const ComponentID compID, Pel *src, Int srcStride, Pel *dst, Int dstStride, Int width, Int height, Int frac, Bool isLast, const ChromaFormat fmt, const Int bitDepth, Bool filterType ) |
---|
599 | #else |
---|
600 | Void TComInterpolationFilter::filterHor(const ComponentID compID, Pel *src, Int srcStride, Pel *dst, Int dstStride, Int width, Int height, Int frac, Bool isLast, const ChromaFormat fmt, const Int bitDepth ) |
---|
601 | #endif |
---|
602 | |
---|
603 | { |
---|
604 | if ( frac == 0 ) |
---|
605 | { |
---|
606 | filterCopy(bitDepth, src, srcStride, dst, dstStride, width, height, true, isLast ); |
---|
607 | } |
---|
608 | else if (isLuma(compID)) |
---|
609 | { |
---|
610 | assert(frac >= 0 && frac < LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS); |
---|
611 | #if NH_3D |
---|
612 | if(filterType) |
---|
613 | { |
---|
614 | filterHor<NTAPS_LUMA_ARP>(bitDepth, src, srcStride, dst, dstStride, width, height, isLast, m_lumaFilterARP[frac]); |
---|
615 | } |
---|
616 | else |
---|
617 | { |
---|
618 | #endif |
---|
619 | filterHor<NTAPS_LUMA>(bitDepth, src, srcStride, dst, dstStride, width, height, isLast, m_lumaFilter[frac]); |
---|
620 | #if NH_3D |
---|
621 | } |
---|
622 | #endif |
---|
623 | |
---|
624 | } |
---|
625 | else |
---|
626 | { |
---|
627 | const UInt csx = getComponentScaleX(compID, fmt); |
---|
628 | assert(frac >=0 && csx<2 && (frac<<(1-csx)) < CHROMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS); |
---|
629 | #if NH_3D |
---|
630 | if(filterType) |
---|
631 | { |
---|
632 | filterHor<NTAPS_CHROMA_ARP>(bitDepth, src, srcStride, dst, dstStride, width, height, isLast, m_chromaFilterARP[frac]); |
---|
633 | } |
---|
634 | else |
---|
635 | { |
---|
636 | #endif |
---|
637 | filterHor<NTAPS_CHROMA>(bitDepth, src, srcStride, dst, dstStride, width, height, isLast, m_chromaFilter[frac<<(1-csx)]); |
---|
638 | #if NH_3D |
---|
639 | } |
---|
640 | #endif |
---|
641 | } |
---|
642 | } |
---|
643 | |
---|
644 | |
---|
645 | /** |
---|
646 | * \brief Filter a block of Luma/Chroma samples (vertical) |
---|
647 | * |
---|
648 | * \param compID Colour component ID |
---|
649 | * \param src Pointer to source samples |
---|
650 | * \param srcStride Stride of source samples |
---|
651 | * \param dst Pointer to destination samples |
---|
652 | * \param dstStride Stride of destination samples |
---|
653 | * \param width Width of block |
---|
654 | * \param height Height of block |
---|
655 | * \param frac Fractional sample offset |
---|
656 | * \param isFirst Flag indicating whether it is the first filtering operation |
---|
657 | * \param isLast Flag indicating whether it is the last filtering operation |
---|
658 | * \param fmt Chroma format |
---|
659 | * \param bitDepth Bit depth |
---|
660 | */ |
---|
661 | #if NH_3D |
---|
662 | Void TComInterpolationFilter::filterVer(const ComponentID compID, Pel *src, Int srcStride, Pel *dst, Int dstStride, Int width, Int height, Int frac, Bool isFirst, Bool isLast, const ChromaFormat fmt, const Int bitDepth, Bool filterType ) |
---|
663 | #else |
---|
664 | Void TComInterpolationFilter::filterVer(const ComponentID compID, Pel *src, Int srcStride, Pel *dst, Int dstStride, Int width, Int height, Int frac, Bool isFirst, Bool isLast, const ChromaFormat fmt, const Int bitDepth ) |
---|
665 | #endif |
---|
666 | { |
---|
667 | if ( frac == 0 ) |
---|
668 | { |
---|
669 | filterCopy(bitDepth, src, srcStride, dst, dstStride, width, height, isFirst, isLast ); |
---|
670 | } |
---|
671 | else if (isLuma(compID)) |
---|
672 | { |
---|
673 | assert(frac >= 0 && frac < LUMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS); |
---|
674 | #if NH_3D |
---|
675 | if(filterType) |
---|
676 | { |
---|
677 | filterVer<NTAPS_LUMA_ARP>(bitDepth, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_lumaFilterARP[frac]); |
---|
678 | } |
---|
679 | else |
---|
680 | { |
---|
681 | #endif |
---|
682 | filterVer<NTAPS_LUMA>(bitDepth, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_lumaFilter[frac]); |
---|
683 | #if NH_3D |
---|
684 | } |
---|
685 | #endif |
---|
686 | |
---|
687 | } |
---|
688 | else |
---|
689 | { |
---|
690 | const UInt csy = getComponentScaleY(compID, fmt); |
---|
691 | assert(frac >=0 && csy<2 && (frac<<(1-csy)) < CHROMA_INTERPOLATION_FILTER_SUB_SAMPLE_POSITIONS); |
---|
692 | #if NH_3D |
---|
693 | if(filterType) |
---|
694 | { |
---|
695 | filterVer<NTAPS_CHROMA_ARP>(bitDepth, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_chromaFilterARP[frac]); |
---|
696 | } |
---|
697 | else |
---|
698 | { |
---|
699 | #endif |
---|
700 | filterVer<NTAPS_CHROMA>(bitDepth, src, srcStride, dst, dstStride, width, height, isFirst, isLast, m_chromaFilter[frac<<(1-csy)]); |
---|
701 | #if NH_3D |
---|
702 | } |
---|
703 | #endif |
---|
704 | } |
---|
705 | } |
---|
706 | //! \} |
---|