| 1 | #include "src_sinc.h" |
/*
 * Fixed-point helpers used as memory operands by the inline-assembly
 * calc_output() below.  The assembly splits a filter index into an integer
 * part (index >> 12) and a 12-bit fractional part (index & fpm1).
 *
 * ifp1: scale that turns the 12-bit fractional part into a double in [0, 1).
 *       It must be exactly 2^-12; a rounded decimal literal would skew every
 *       interpolation fraction slightly.
 * fpm1: mask selecting the 12 fractional bits.
 */
static double ifp1 = 1.0 / 4096.0;
static int fpm1 = 0xfff;
| 4 | |
| 5 | #if (defined(ARCH_X86_32) && defined(MMX)) |
| 6 | // gcc -O3: About 25% faster on x86 but C is faster on amd64 |
| 7 | double calc_output(SINC_FILTER *filter, increment_t increment, |
| 8 | increment_t start_filter_index, int ch) |
| 9 | { |
| 10 | double fpm[2] ALIGN16; |
| 11 | double ifp[2] ALIGN16; |
| 12 | double incd[2] ALIGN16; |
| 13 | increment_t incsl[4] ALIGN16; |
| 14 | increment_t incsr[4] ALIGN16; |
| 15 | double ret ALIGN16; |
| 16 | double fraction, icoeff; |
| 17 | increment_t filter_index, max_filter_index; |
| 18 | long i, indx, data_index; |
| 19 | int coeff_count; |
| 20 | |
| 21 | incsl[0] = incsr[3] = 0; |
| 22 | incsl[1] = incsr[2] = increment; |
| 23 | incsl[2] = incsr[1] = increment * 2; |
| 24 | incsl[3] = incsr[0] = increment * 3; |
| 25 | |
| 26 | max_filter_index = int_to_fp(filter->coeff_half_len); |
| 27 | |
| 28 | /* First apply the left half of the filter. */ |
| 29 | filter_index = start_filter_index ; |
| 30 | coeff_count = (max_filter_index - filter_index) / increment ; |
| 31 | filter_index = filter_index + coeff_count * increment ; |
| 32 | data_index = filter->b_current - filter->channels * coeff_count + ch ; |
| 33 | |
| 34 | ret = 0.0; |
| 35 | |
| 36 | asm volatile ( |
| 37 | // prep |
| 38 | "movd %6, %%xmm2 \n\t" // increment * 4 |
| 39 | "movsd %10, %%xmm4 \n\t" // 0 ifp1 |
| 40 | "punpckldq %%xmm2, %%xmm2 \n\t" |
| 41 | "movd %11, %%xmm3 \n\t" // 0 0 0 fpm1 |
| 42 | "unpcklpd %%xmm4, %%xmm4 \n\t" // ifp1 ifp1 |
| 43 | "movd %3, %%xmm7 \n\t" // 0 0 0 FI |
| 44 | "punpckldq %%xmm3, %%xmm3 \n\t" |
| 45 | "movdqa %%xmm4, %8 \n\t" // 8 = INV_FP_ONE |
| 46 | "punpckldq %%xmm3, %%xmm3 \n\t" // fpm1 fpm1 fpm1 fpm1 |
| 47 | "movdqa %2, %%xmm1 \n\t" // inc*3 inc*2 inc 0 |
| 48 | "punpckldq %%xmm2, %%xmm2 \n\t" // inc*4 inc*4 inc*4 inc*4 |
| 49 | "movdqa %%xmm3, %7 \n\t" // 7 = frac_part_mask |
| 50 | "punpckldq %%xmm7, %%xmm7 \n\t" // 0 0 FI FI |
| 51 | "pslld $2, %%xmm2 \n\t" |
| 52 | "punpckldq %%xmm7, %%xmm7 \n\t" // FI FI FI FI |
| 53 | "movdqa %%xmm2, %9 \n\t" // 9 = INCD |
| 54 | "psubd %%xmm1, %%xmm7 \n\t" // fi3 fi2 fi1 fi0 |
| 55 | // using 7 |
| 56 | // loop |
| 57 | "1: \n\t" |
| 58 | "pshufd $0xe4,%%xmm7, %%xmm3\n\t" |
| 59 | "movdqa %%xmm7, %%xmm2 \n\t" |
| 60 | "pand %7, %%xmm3 \n\t" // fp_fraction_parts |
| 61 | "psrld $12, %%xmm2 \n\t" // indx3 indx2 indx1 indx0 |
| 62 | "cvtdq2pd %%xmm3, %%xmm4 \n\t" // fpfp 1 fpfp 0 |
| 63 | "psrldq $8, %%xmm3 \n\t" |
| 64 | "movd %%xmm2, %%eax \n\t" // indx0 |
| 65 | "cvtdq2pd %%xmm3, %%xmm5 \n\t" // fpfp 3 fpfp 2 |
| 66 | "mulpd %8, %%xmm4 \n\t" // frac 1 frac 0 |
| 67 | "movlps (%4,%%eax,4), %%xmm1\n\t" // 0 0 c01 c0 |
| 68 | "mulpd %8, %%xmm5 \n\t" // frac 3 frac 2 |
| 69 | // using 1 2 4 5 7 |
| 70 | "psrldq $4, %%xmm2 \n\t" |
| 71 | "cvtps2pd %%xmm1, %%xmm1 \n\t" // c01 c0 |
| 72 | "movd %%xmm2, %%eax \n\t" // indx1 |
| 73 | "psrldq $4, %%xmm2 \n\t" |
| 74 | "movlps (%4,%%eax,4), %%xmm3\n\t" // 0 0 c11 c1 |
| 75 | "movd %%xmm2, %%eax \n\t" // indx2 |
| 76 | "cvtps2pd %%xmm3, %%xmm3 \n\t" // c11 c1 |
| 77 | "movdqa %%xmm1, %%xmm6 \n\t" |
| 78 | "psrldq $4, %%xmm2 \n\t" |
| 79 | "unpcklpd %%xmm3, %%xmm1 \n\t" // c1 c0 |
| 80 | "movlps (%4,%%eax,4), %%xmm0\n\t" // 0 0 c21 c2 |
| 81 | "unpckhpd %%xmm3, %%xmm6 \n\t" // c11 c01 |
| 82 | "movd %%xmm2, %%eax \n\t" // indx3 |
| 83 | "cvtps2pd %%xmm0, %%xmm0 \n\t" // c21 c2 |
| 84 | // using 0 1 4 5 6 7 |
| 85 | "movlps (%4,%%eax,4), %%xmm2\n\t" // 0 0 c31 c3 |
| 86 | "movdqa %%xmm0, %%xmm3 \n\t" |
| 87 | "cvtps2pd %%xmm2, %%xmm2 \n\t" // c31 c3 |
| 88 | "subpd %%xmm1, %%xmm6 \n\t" // c11-c1 c01-c0 |
| 89 | "unpcklpd %%xmm2, %%xmm0 \n\t" // c3 c2 |
| 90 | "mulpd %%xmm4, %%xmm6 \n\t" // frac1*c11-c1 frac0*c01-c0 |
| 91 | "unpckhpd %%xmm2, %%xmm3 \n\t" // c31 c21 |
| 92 | // using 0 1 3 5 6 7 |
| 93 | "addpd %%xmm6, %%xmm1 \n\t" // icoeff1 icoeff0 |
| 94 | "subpd %%xmm0, %%xmm3 \n\t" // c31-c3 c21-c2 |
| 95 | "movups (%5, %1, 4), %%xmm2 \n\t" // d3 d2 d1 d0 |
| 96 | "mulpd %%xmm5, %%xmm3 \n\t" // frac3*c31-c3 frac0*c21-c2 |
| 97 | "movups 16(%5,%1,4), %%xmm4 \n\t" // d7 d6 d5 d4 |
| 98 | "cvtps2pd %%xmm2, %%xmm5 \n\t" // d1 d0 |
| 99 | "addpd %%xmm3, %%xmm0 \n\t" // icoeff3 icoeff2 |
| 100 | // using 0 1 2 4 7 |
| 101 | "psrldq $8, %%xmm2 \n\t" |
| 102 | "cvtps2pd %%xmm4, %%xmm3 \n\t" // d5 d4 |
| 103 | "cvtps2pd %%xmm2, %%xmm6 \n\t" // d3 d2 |
| 104 | "psrldq $8, %%xmm4 \n\t" |
| 105 | "unpcklpd %%xmm6, %%xmm5 \n\t" // d2 d0 |
| 106 | "cvtps2pd %%xmm4, %%xmm2 \n\t" // d7 d6 |
| 107 | // using 0 1 2 3 5 7 |
| 108 | "mulpd %%xmm1, %%xmm5 \n\t" |
| 109 | "unpcklpd %%xmm2, %%xmm3 \n\t" // d6 d4 |
| 110 | "psubd %9, %%xmm7 \n\t" // fi -= increment (x 4) |
| 111 | "mulpd %%xmm0, %%xmm3 \n\t" |
| 112 | "movdqa %%xmm7, %%xmm2 \n\t" |
| 113 | "addpd %%xmm5, %%xmm3 \n\t" |
| 114 | "add $8, %1 \n\t" // data_index += 4 * channels |
| 115 | "haddpd %%xmm3, %%xmm3 \n\t" |
| 116 | "psrldq $12, %%xmm2 \n\t" |
| 117 | "movd %%xmm2, %%eax \n\t" |
| 118 | // using a 3 7 |
| 119 | "addsd %0, %%xmm3 \n\t" // ret += (d * icoeff) x 4 |
| 120 | "test %%eax, %%eax \n\t" // lowest fi >= 0 ? |
| 121 | "movsd %%xmm3, %0 \n\t" |
| 122 | "jns 1b \n\t" |
| 123 | "movdqa %%xmm7, %2 \n\t" |
| 124 | :"=m"(ret),"+r"(data_index) |
| 125 | :"m"(incsl[0]),"m"(filter_index),"r"(filter->coeffs), |
| 126 | "r"(filter->buffer),"m"(increment),"m"(fpm[0]),"m"(ifp[0]), |
| 127 | "m"(incd[0]),"m"(ifp1),"m"(fpm1) |
| 128 | :"eax","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","memory" |
| 129 | ); |
| 130 | |
| 131 | for (i = 0; i < 3; i++) |
| 132 | { |
| 133 | if (incsl[i] >= 0) |
| 134 | { |
| 135 | fraction = fp_to_double(incsl[i]); |
| 136 | indx = fp_to_int(incsl[i]); |
| 137 | icoeff = filter->coeffs[indx] + fraction * |
| 138 | (filter->coeffs[indx + 1] - filter->coeffs[indx]); |
| 139 | ret += icoeff * filter->buffer[data_index]; |
| 140 | data_index += 2; |
| 141 | } |
| 142 | } |
| 143 | |
| 144 | /* Now apply the right half of the filter. */ |
| 145 | filter_index = increment - start_filter_index ; |
| 146 | coeff_count = (max_filter_index - filter_index) / increment ; |
| 147 | filter_index = filter_index + coeff_count * increment ; |
| 148 | data_index = filter->b_current + filter->channels * (1 + coeff_count) + ch; |
| 149 | |
| 150 | asm volatile ( |
| 151 | // prep |
| 152 | "movd %3, %%xmm7 \n\t" // 0 0 0 FI |
| 153 | "punpckldq %%xmm7, %%xmm7 \n\t" // 0 0 FI FI |
| 154 | "movdqa %2, %%xmm5 \n\t" // 0 inc inc*2 inc*3 |
| 155 | "punpckldq %%xmm7, %%xmm7 \n\t" // FI FI FI FI |
| 156 | "psubd %%xmm5, %%xmm7 \n\t" // fi0 fi1 fi2 fi3 |
| 157 | // using 7 |
| 158 | // loop |
| 159 | "2: \n\t" |
| 160 | "pshufd $0xe4,%%xmm7, %%xmm3\n\t" |
| 161 | "movdqa %%xmm7, %%xmm2 \n\t" |
| 162 | "pand %7, %%xmm3 \n\t" // fp_fraction_parts |
| 163 | "psrld $12, %%xmm2 \n\t" // indx0 indx1 indx2 indx3 |
| 164 | "cvtdq2pd %%xmm3, %%xmm4 \n\t" // fpfp 2 fpfp 3 |
| 165 | "psrldq $8, %%xmm3 \n\t" |
| 166 | "movd %%xmm2, %%eax \n\t" // indx3 |
| 167 | "cvtdq2pd %%xmm3, %%xmm5 \n\t" // fpfp 0 fpfp 1 |
| 168 | "mulpd %8, %%xmm4 \n\t" // frac 2 frac 3 |
| 169 | "movlps (%4,%%eax,4), %%xmm1\n\t" // 0 0 c31 c3 |
| 170 | "mulpd %8, %%xmm5 \n\t" // frac 0 frac 1 |
| 171 | // using 1 2 4 5 7 |
| 172 | "psrldq $4, %%xmm2 \n\t" |
| 173 | "cvtps2pd %%xmm1, %%xmm1 \n\t" // c31 c3 |
| 174 | "movd %%xmm2, %%eax \n\t" // indx2 |
| 175 | "psrldq $4, %%xmm2 \n\t" |
| 176 | "movlps (%4,%%eax,4), %%xmm3\n\t" // 0 0 c21 c2 |
| 177 | "movd %%xmm2, %%eax \n\t" // indx1 |
| 178 | "cvtps2pd %%xmm3, %%xmm3 \n\t" // c21 c2 |
| 179 | "movdqa %%xmm1, %%xmm6 \n\t" |
| 180 | "psrldq $4, %%xmm2 \n\t" |
| 181 | "unpcklpd %%xmm3, %%xmm1 \n\t" // c2 c3 |
| 182 | "movlps (%4,%%eax,4), %%xmm0\n\t" // 0 0 c11 c1 |
| 183 | "unpckhpd %%xmm3, %%xmm6 \n\t" // c21 c31 |
| 184 | "movd %%xmm2, %%eax \n\t" // indx0 |
| 185 | "cvtps2pd %%xmm0, %%xmm0 \n\t" // c11 c1 |
| 186 | // using 0 1 2 4 5 6 7 |
| 187 | "movlps (%4,%%eax,4), %%xmm2\n\t" // 0 0 c01 c0 |
| 188 | "movdqa %%xmm0, %%xmm3 \n\t" |
| 189 | "cvtps2pd %%xmm2, %%xmm2 \n\t" // c01 c0 |
| 190 | "subpd %%xmm1, %%xmm6 \n\t" // c21-c2 c31-c3 |
| 191 | "unpcklpd %%xmm2, %%xmm0 \n\t" // c0 c1 |
| 192 | "mulpd %%xmm4, %%xmm6 \n\t" // frac2*c21-c2 frac3*c31-c3 |
| 193 | "unpckhpd %%xmm2, %%xmm3 \n\t" // c01 c11 |
| 194 | // using 0 1 3 5 6 7 |
| 195 | "addpd %%xmm6, %%xmm1 \n\t" // icoeff2 icoeff3 |
| 196 | "subpd %%xmm0, %%xmm3 \n\t" // c01-c0 c11-c1 |
| 197 | "movups -12(%5,%1,4), %%xmm2\n\t" // d0 d1 d2 d3 |
| 198 | "mulpd %%xmm5, %%xmm3 \n\t" // frac1*c01-c0 frac0*c11-c1 |
| 199 | "movups -28(%5,%1,4), %%xmm4\n\t" // d4 d5 d6 d7 |
| 200 | "cvtps2pd %%xmm2, %%xmm5 \n\t" // d2 d3 |
| 201 | "addpd %%xmm3, %%xmm0 \n\t" // icoeff0 icoeff1 |
| 202 | // using 0 1 2 4 5 7 |
| 203 | "psrldq $8, %%xmm2 \n\t" |
| 204 | "cvtps2pd %%xmm4, %%xmm3 \n\t" // d6 d7 |
| 205 | "cvtps2pd %%xmm2, %%xmm6 \n\t" // d0 d1 |
| 206 | "psrldq $8, %%xmm4 \n\t" |
| 207 | "unpckhpd %%xmm6, %%xmm5 \n\t" // d0 d2 |
| 208 | "cvtps2pd %%xmm4, %%xmm2 \n\t" // d4 d5 |
| 209 | // using 0 1 2 3 5 7 |
| 210 | "mulpd %%xmm0, %%xmm5 \n\t" |
| 211 | "unpckhpd %%xmm2, %%xmm3 \n\t" // d4 d6 |
| 212 | "psubd %9, %%xmm7 \n\t" // fi -= increment (x 4) |
| 213 | "mulpd %%xmm1, %%xmm3 \n\t" |
| 214 | "movdqa %%xmm7, %%xmm2 \n\t" |
| 215 | "addpd %%xmm5, %%xmm3 \n\t" |
| 216 | "sub $8, %1 \n\t" // data_index -= 4 * channels |
| 217 | "haddpd %%xmm3, %%xmm3 \n\t" |
| 218 | "movd %%xmm2, %%eax \n\t" |
| 219 | // using a 3 7 |
| 220 | "addsd %0, %%xmm3 \n\t" // ret += (d * icoeff) x 4 |
| 221 | "test %%eax, %%eax \n\t" // (lowest) fi > 0 |
| 222 | "movsd %%xmm3, %0 \n\t" |
| 223 | "jg 2b \n\t" |
| 224 | "movdqa %%xmm7, %2 \n\t" |
| 225 | :"=m"(ret),"+r"(data_index) |
| 226 | :"m"(incsr[0]),"m"(filter_index),"r"(filter->coeffs), |
| 227 | "r"(filter->buffer),"m"(increment),"m"(fpm[0]),"m"(ifp[0]), |
| 228 | "m"(incd[0]) |
| 229 | :"eax","xmm0","xmm1","xmm2","xmm3","xmm4","xmm5","xmm6","xmm7","memory" |
| 230 | ); |
| 231 | |
| 232 | // do remaining few |
| 233 | for (i = 3; i > 0; i--) |
| 234 | { |
| 235 | if (incsr[i] > 0) |
| 236 | { |
| 237 | fraction = fp_to_double(incsr[i]); |
| 238 | indx = fp_to_int(incsr[i]); |
| 239 | icoeff = filter->coeffs[indx] + fraction * |
| 240 | (filter->coeffs[indx + 1] - filter->coeffs[indx]); |
| 241 | ret += icoeff * filter->buffer[data_index]; |
| 242 | data_index -= 2; |
| 243 | } |
| 244 | } |
| 245 | |
| 246 | return ret; |
| 247 | |
| 248 | } |
| 249 | #else |
| 250 | double calc_output(SINC_FILTER *filter, increment_t increment, |
| 251 | increment_t start_filter_index, int ch) |
| 252 | { |
| 253 | double fraction, left, right, icoeff ; |
| 254 | increment_t filter_index, max_filter_index ; |
| 255 | int data_index, coeff_count, indx ; |
| 256 | |
| 257 | /* Convert input parameters into fixed point. */ |
| 258 | max_filter_index = int_to_fp (filter->coeff_half_len) ; |
| 259 | |
| 260 | /* First apply the left half of the filter. */ |
| 261 | filter_index = start_filter_index ; |
| 262 | coeff_count = (max_filter_index - filter_index) / increment ; |
| 263 | filter_index = filter_index + coeff_count * increment ; |
| 264 | data_index = filter->b_current - filter->channels * coeff_count + ch ; |
| 265 | |
| 266 | left = 0.0 ; |
| 267 | do |
| 268 | { fraction = fp_to_double (filter_index) ; |
| 269 | indx = fp_to_int (filter_index) ; |
| 270 | |
| 271 | icoeff = filter->coeffs [indx] + fraction * (filter->coeffs [indx + 1] - filter->coeffs [indx]) ; |
| 272 | |
| 273 | left += icoeff * filter->buffer [data_index] ; |
| 274 | |
| 275 | filter_index -= increment ; |
| 276 | data_index = data_index + filter->channels ; |
| 277 | } |
| 278 | while (filter_index >= MAKE_INCREMENT_T (0)) ; |
| 279 | |
| 280 | /* Now apply the right half of the filter. */ |
| 281 | filter_index = increment - start_filter_index ; |
| 282 | coeff_count = (max_filter_index - filter_index) / increment ; |
| 283 | filter_index = filter_index + coeff_count * increment ; |
| 284 | data_index = filter->b_current + filter->channels * (1 + coeff_count) + ch ; |
| 285 | |
| 286 | right = 0.0 ; |
| 287 | do |
| 288 | { fraction = fp_to_double (filter_index) ; |
| 289 | indx = fp_to_int (filter_index) ; |
| 290 | |
| 291 | icoeff = filter->coeffs [indx] + fraction * (filter->coeffs [indx + 1] - filter->coeffs [indx]) ; |
| 292 | |
| 293 | right += icoeff * filter->buffer [data_index] ; |
| 294 | |
| 295 | filter_index -= increment ; |
| 296 | data_index = data_index - filter->channels ; |
| 297 | } |
| 298 | while (filter_index > MAKE_INCREMENT_T (0)) ; |
| 299 | |
| 300 | return (left + right) ; |
| 301 | } |
| 302 | #endif |