Commit 4b3d5be2 authored by Emmanuele Bassi's avatar Emmanuele Bassi

Drop the MMX assembly optimizations

We haven't built them on anything that isn't a 32bit IA platform, and
we could probably get better mileage out of the currently implemented
pixops just by rearranging the C code and letting compilers do the
optimizations for us. We should definitely consider either using pixman
directly, or replacing slow pixops with SSE builtins, instead.
parent b4acb66c
......@@ -280,76 +280,3 @@ Integer tricks for compositing
MMX Code
========
Line functions are provided in MMX functions for a few special
cases:
n_x = n_y = 2
src_channels = 3 dest_channels = 3 op = scale
src_channels = 4 with alpha dest_channels = 4 no alpha op = composite
src_channels = 4 with alpha dest_channels = 4 no alpha op = composite_color
For the case n_x = n_y = 2 - primarily hit when scaling up with bilinear
scaling, we can take advantage of the fact that multiple destination
pixels will be composed from the same source pixels.
That is a destination pixel is a linear combination of the source
pixels around it:
S0 S1
D D' D'' ...
S2 S3
Each mmx register is 64 bits wide, so we can unpack a source pixel
into the low 8 bits of 4 16 bit words, and store it into a mmx
register.
For each destination pixel, we first make sure that we have pixels S0
... S3 loaded into registers mm0 ... mm3. (This will often involve not
doing anything or moving mm1 and mm3 into mm0 and mm1 then reloading
mm1 and mm3 with new values).
Then we load up the appropriate weights for the 4 corner pixels
based on the offsets of the destination pixel within the source
pixels.
We have preexpanded the weights to 64 bits wide and truncated the
range to 8 bits, so an original filter value of
0x5321 would be expanded to
0x0053005300530053
For source buffers without alpha, we simply do a multiply-add
of the weights, giving us a 16 bit quantity for the result
that we shift right by 8 and store in the destination buffer.
When the source buffer has alpha, then things become more
complicated - when we load up mm0 and mm3, we premultiply
the alpha, so they contain:
(a*ff >> 8) (r*a >> 8) (g*a >> 8) (b*a >> 8)
Then when we multiply by the weights, and add we end up
with premultiplied r,g,b,a in the range of 0 .. 0xff * 0xff,
call them A,R,G,B
We then need to composite with the dest pixels - which
we do by:
r_dest = (R + ((0xff * 0xff - A) >> 8) * r_dest) >> 8
(0xff * 0xff)
......@@ -138,23 +138,6 @@ TODO
switching around conditionals and inner loops in various
places.
* Right now, in several of the most common cases, there are
optimized mmx routines, but no optimized C routines.
For instance, there is a
pixops_composite_line_22_4a4_mmx()
But no
pixops_composite_line_22_4a4()
Also, it may be desirable to include a few more special cases - in particular:
pixops_composite_line_22_4a3()
May be desirable.
* Scaling down images by large scale factors is _slow_ since huge filter
matrixes are computed. (e.g., to scale down by a factor of 100, we compute
101x101 filter matrixes. At some point, it would be more efficient to
......
/*
* Copyright (C) 2000 Red Hat, Inc
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
/*
 * MMX line function for the n_x = n_y = 2 composite case (4-channel source
 * with alpha onto a 4-channel destination).  Walks the destination from p
 * towards p_end four bytes at a time, combining two adjacent source columns
 * taken from rows q1 and q2 with a per-phase row of weights, and returns the
 * final destination pointer in %eax.  32-bit x86; all arguments are read
 * from the stack relative to %ebp.
 */
.file "composite_line_22_4a4_mmx.S"
.version "01.01"
gcc2_compiled.:
.text
.align 16
#if !defined(__MINGW32__) && !defined(__CYGWIN__) && !defined(__INTERIX)
/* Magic indicating no need for an executable stack */
#if !defined __powerpc64__ && !defined __ia64__
.section .note.GNU-stack; .previous
#endif
/* Targets other than MinGW/Cygwin/Interix: one leading underscore, plus a .type annotation */
.globl _pixops_composite_line_22_4a4_mmx
.type _pixops_composite_line_22_4a4_mmx,@function
_pixops_composite_line_22_4a4_mmx:
#else
/* MinGW/Cygwin/Interix: the symbol is emitted with two leading underscores */
.globl __pixops_composite_line_22_4a4_mmx
__pixops_composite_line_22_4a4_mmx:
#endif
/*
 * Arguments
 *
 * weights: 8(%ebp)     (kept in %edi inside the loop)
 * p: 12(%ebp) %esi     (current destination pointer)
 * q1: 16(%ebp)         (first source row)
 * q2: 20(%ebp)         (second source row)
 * xstep: 24(%ebp)      (added to x after every destination pixel)
 * p_end: 28(%ebp)      (loop stops when p reaches this address)
 * xinit: 32(%ebp)      (initial x; the source index is x >> 16)
 *
 */
/*
 * Function call entry
 */
/* Set up the frame, reserve 28 bytes of locals and save %edi/%esi/%ebx */
pushl %ebp
movl %esp,%ebp
subl $28,%esp
pushl %edi
pushl %esi
pushl %ebx
/* Locals:
 * int x %ebx
 * int x_scaled -24(%ebp)
 */
/*
 * Setup
 */
/* Initialize variables */
/* x = xinit; x_scaled = xinit >> 16; %esi = p */
movl 32(%ebp),%ebx
movl 32(%ebp),%edx
sarl $16,%edx
movl 12(%ebp),%esi
movl %edx,-24(%ebp)
/* Nothing to do if p is already at or beyond p_end (unsigned compare) */
cmpl 28(%ebp),%esi
jnb .out
/* Load initial values into %mm1, %mm3 */
/* %edx = x_scaled * 4, the byte offset of a 4-byte pixel; %mm4 = 0 for unpacking */
shll $2, %edx
pxor %mm4, %mm4
/* Pixel from q1: integer copy in %eax, bytes widened to four words in %mm5 */
movl 16(%ebp),%edi
movl (%edi, %edx), %eax
movd (%edi, %edx), %mm5
punpcklbw %mm4, %mm5
/*
 * Build the per-channel multiplier: take the top byte of the pixel (used as
 * alpha), replicate it into the three low bytes via * 0x010101, and force
 * the top byte to 0xff.  (mull also overwrites %edx.)
 */
shrl $24, %eax
movl $0x010101, %ecx
mull %ecx
orl $0xff000000, %eax
movd %eax, %mm1
punpcklbw %mm4, %mm1
/* %mm1 = pixel words * multiplier words; the >> 8 is applied further down */
pmullw %mm5,%mm1
/* Recompute the byte offset (mull destroyed %edx) and repeat for q2 into %mm3 */
movl -24(%ebp),%edx
shll $2, %edx
movl 20(%ebp),%edi
movl (%edi, %edx), %eax
movd (%edi, %edx), %mm5
punpcklbw %mm4, %mm5
shrl $24, %eax
movl $0x010101, %ecx
mull %ecx
orl $0xff000000, %eax
movd %eax, %mm3
punpcklbw %mm4, %mm3
pmullw %mm5,%mm3
/* Finish the premultiply: each product word >> 8 */
psrlw $8,%mm1
psrlw $8,%mm3
/*
 * Advance x by 65536 (one unit of x >> 16) and enter .newx, which shifts the
 * pixels just loaded into %mm0/%mm2 and loads the next column into %mm1/%mm3.
 */
addl $65536,%ebx
movl %ebx,%edx
sarl $16,%edx
jmp .newx
.p2align 4,,7
.loop:
/*
 * int x_index = (x & 0xf000) >> 12
 * The shift below is by 7 rather than 12, so %eax ends up as x_index * 32:
 * the byte offset of one 32-byte row of weights.
 */
movl %ebx,%eax
andl $0xf000,%eax
shrl $7,%eax
/* Multiply each of the four pixels by its 8-byte weight entry and sum into %mm7 */
movq (%edi,%eax),%mm4
pmullw %mm0,%mm4
movq 8(%edi,%eax),%mm5
pmullw %mm1,%mm5
movq 16(%edi,%eax),%mm6
movq 24(%edi,%eax),%mm7
pmullw %mm2,%mm6
pmullw %mm3,%mm7
paddw %mm4, %mm5
paddw %mm6, %mm7
paddw %mm5, %mm7
/* %mm4 = %mm6 = 0xffff in the top word, zero elsewhere */
movl $0xffff,%ecx
movd %ecx,%mm4
psllq $48,%mm4
movq %mm4,%mm6
/* Keep only the top word of (0xffff - sum), i.e. 0xffff minus the summed top channel */
psubw %mm7,%mm4
pand %mm6,%mm4
/* Replicate that word into the three low words of %mm5 (top word ends up 0) */
movq %mm4,%mm5
psrlq $16,%mm4
por %mm4,%mm5
psrlq $32,%mm5
por %mm4,%mm5
/* Reduce each word to its high byte */
psrlw $8,%mm5
/*
 * Load the current destination pixel and widen its bytes to words.
 * NOTE(review): this overwrites %mm7, which held the weighted source sum,
 * and %mm6 (added below) still holds only the 0xffff mask, so as written the
 * source sum does not appear to reach the stored result.  The colour variant
 * keeps the sum in %mm7 and adds into it -- confirm against a known-good
 * copy of this file.
 */
movd (%esi),%mm7
pxor %mm4,%mm4
punpcklbw %mm4, %mm7
/* Scale the destination channels by the replicated (0xffff - A) >> 8 factor */
pmullw %mm7,%mm5
/* x += x_step; */
addl 24(%ebp),%ebx
/* x_scale = x >> 16; */
movl %ebx,%edx
sarl $16,%edx
/* Add, take the high byte of each word, pack to bytes and store 4 bytes at p */
paddw %mm5,%mm6
psrlw $8,%mm6
packuswb %mm6, %mm6
movd %mm6,(%esi)
/* p += 4; stop when p == p_end (equality test, so p_end must be hit exactly) */
addl $4, %esi
cmpl %esi,28(%ebp)
je .out
/* Same source column as before: reuse the loaded pixels */
cmpl %edx,-24(%ebp)
je .loop
.newx:
/* Remember the new source column index */
movl %edx,-24(%ebp)
/*
 * Load the two new values into %mm1, %mm3, move old values into %mm0, %mm2
 */
movq %mm1, %mm0
movq %mm3, %mm2
shll $2, %edx
/* # %mm4 will always be already clear here */
/* # pxor %mm4, %mm4 */
/* New pixel from q1, premultiplied exactly as in the setup code above */
movl 16(%ebp),%edi
movl (%edi, %edx), %eax
movd (%edi, %edx), %mm5
punpcklbw %mm4, %mm5
shrl $24, %eax
movl $0x010101, %ecx
mull %ecx
/*
 * mull destroyed %edx, need to reconstitute
 */
movl -24(%ebp),%edx
shll $2, %edx
orl $0xff000000, %eax
movd %eax, %mm1
punpcklbw %mm4, %mm1
pmullw %mm5,%mm1
/* New pixel from q2 into %mm3 */
movl 20(%ebp),%edi
movl (%edi, %edx), %eax
movd (%edi, %edx), %mm5
punpcklbw %mm4, %mm5
shrl $24, %eax
movl $0x010101, %ecx
mull %ecx
orl $0xff000000, %eax
movd %eax, %mm3
punpcklbw %mm4, %mm3
pmullw %mm5,%mm3
psrlw $8,%mm1
psrlw $8,%mm3
/* %edi was used for q1/q2 above; point it back at the weights for .loop */
movl 8(%ebp),%edi
jmp .loop
.out:
/* Return the current destination pointer */
movl %esi,%eax
/* Leave MMX state so the FPU is usable again */
emms
/* -40 = 28 bytes of locals + 3 saved registers; restore them and the frame */
leal -40(%ebp),%esp
popl %ebx
popl %esi
popl %edi
movl %ebp,%esp
popl %ebp
ret
/*
* Copyright (C) 2000 Red Hat, Inc
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
/*
 * MMX line function for the n_x = n_y = 2 composite_color case (4-channel
 * source with alpha, 4-channel destination).  Like the plain composite
 * routine, it walks the destination from p towards p_end four bytes at a
 * time, but the background for each pixel is picked from a two-entry colour
 * table by dest_x instead of being read from the destination.  Returns the
 * final destination pointer in %eax.  32-bit x86; arguments are read from
 * the stack relative to %ebp.
 */
.file "composite_line_color_22_4a4_mmx.S"
.version "01.01"
gcc2_compiled.:
.text
.align 16
#if !defined(__MINGW32__) && !defined(__CYGWIN__) && !defined(__INTERIX)
/* Magic indicating no need for an executable stack */
#if !defined __powerpc64__ && !defined __ia64__
.section .note.GNU-stack; .previous
#endif
/* Targets other than MinGW/Cygwin/Interix: one leading underscore, plus a .type annotation */
.globl _pixops_composite_line_color_22_4a4_mmx
.type _pixops_composite_line_color_22_4a4_mmx,@function
_pixops_composite_line_color_22_4a4_mmx:
#else
/* MinGW/Cygwin/Interix: the symbol is emitted with two leading underscores */
.globl __pixops_composite_line_color_22_4a4_mmx
__pixops_composite_line_color_22_4a4_mmx:
#endif
/*
 * Arguments
 *
 * weights: 8(%ebp)       (kept in %edi inside the loop)
 * p: 12(%ebp) %esi       (current destination pointer)
 * q1: 16(%ebp)           (first source row)
 * q2: 20(%ebp)           (second source row)
 * xstep: 24(%ebp)        (added to x after every destination pixel)
 * p_end: 28(%ebp)        (loop stops when p reaches this address)
 * xinit: 32(%ebp)        (initial x; the source index is x >> 16)
 * dest_x: 36(%ebp)       (incremented in place once per destination pixel)
 * check_shift: 40(%ebp)  (dest_x is shifted right by this to pick a colour)
 * colors: 44(%ebp)       (table of two 8-byte entries)
 *
 */
/*
 * Function call entry
 */
/* Set up the frame, reserve 28 bytes of locals and save %edi/%esi/%ebx */
pushl %ebp
movl %esp,%ebp
subl $28,%esp
pushl %edi
pushl %esi
pushl %ebx
/* Locals:
 * int x %ebx
 * int x_scaled -24(%ebp)
 */
/*
 * Setup
 */
/* Initialize variables */
/* x = xinit; x_scaled = xinit >> 16; %esi = p */
movl 32(%ebp),%ebx
movl 32(%ebp),%edx
sarl $16,%edx
movl 12(%ebp),%esi
movl %edx,-24(%ebp)
/* Nothing to do if p is already at or beyond p_end (unsigned compare) */
cmpl 28(%ebp),%esi
jnb .out
/* Load initial values into %mm1, %mm3 */
/* %edx = x_scaled * 4, the byte offset of a 4-byte pixel; %mm4 = 0 for unpacking */
shll $2, %edx
pxor %mm4, %mm4
/* Pixel from q1: integer copy in %eax, bytes widened to four words in %mm5 */
movl 16(%ebp),%edi
movl (%edi, %edx), %eax
movd (%edi, %edx), %mm5
punpcklbw %mm4, %mm5
/*
 * Build the per-channel multiplier: top byte of the pixel (used as alpha)
 * replicated into the three low bytes via * 0x010101, top byte forced to 0xff.
 */
shrl $24, %eax
movl $0x010101, %ecx
mull %ecx
orl $0xff000000, %eax
movd %eax, %mm1
punpcklbw %mm4, %mm1
/* %mm1 = pixel words * multiplier words; the >> 8 is applied further down */
pmullw %mm5,%mm1
/*
 * mull destroyed %edx, need to reconstitute
 */
movl -24(%ebp),%edx
shll $2, %edx
/* Same premultiply for the pixel from q2, into %mm3 */
movl 20(%ebp),%edi
movl (%edi, %edx), %eax
movd (%edi, %edx), %mm5
punpcklbw %mm4, %mm5
shrl $24, %eax
movl $0x010101, %ecx
mull %ecx
orl $0xff000000, %eax
movd %eax, %mm3
punpcklbw %mm4, %mm3
pmullw %mm5,%mm3
/* Finish the premultiply: each product word >> 8 */
psrlw $8,%mm1
psrlw $8,%mm3
/*
 * Advance x by 65536 (one unit of x >> 16) and enter .newx, which shifts the
 * pixels just loaded into %mm0/%mm2 and loads the next column into %mm1/%mm3.
 */
addl $65536,%ebx
movl %ebx,%edx
sarl $16,%edx
jmp .newx
.p2align 4,,7
.loop:
/*
 * int x_index = (x & 0xf000) >> 12
 * The shift below is by 7 rather than 12, so %eax ends up as x_index * 32:
 * the byte offset of one 32-byte row of weights.
 */
movl %ebx,%eax
andl $0xf000,%eax
shrl $7,%eax
/* Multiply each of the four pixels by its 8-byte weight entry and sum into %mm7 */
movq (%edi,%eax),%mm4
pmullw %mm0,%mm4
movq 8(%edi,%eax),%mm5
pmullw %mm1,%mm5
movq 16(%edi,%eax),%mm6
movq 24(%edi,%eax),%mm7
pmullw %mm2,%mm6
pmullw %mm3,%mm7
paddw %mm4, %mm5
paddw %mm6, %mm7
paddw %mm5, %mm7
/* %mm4 = %mm6 = 0xffff in the top word, zero elsewhere */
movl $0xffff,%ecx
movd %ecx,%mm4
psllq $48,%mm4
movq %mm4,%mm6
/* Keep only the top word of (0xffff - sum), i.e. 0xffff minus the summed top channel */
psubw %mm7,%mm4
pand %mm6,%mm4
/* Replicate that word into the three low words of %mm5 (top word ends up 0) */
movq %mm4,%mm5
psrlq $16,%mm4
por %mm4,%mm5
psrlq $32,%mm5
por %mm4,%mm5
/* Reduce each word to its high byte */
psrlw $8,%mm5
/*
 * Pick the background colour: index = (dest_x >> check_shift) & 1, with
 * dest_x post-incremented in its stack slot; each table entry is 8 bytes.
 */
movl 36(%ebp),%eax
incl 36(%ebp)
movl 40(%ebp),%ecx
shrl %cl,%eax
andl $1,%eax
movl 44(%ebp),%ecx
movq (%ecx,%eax,8),%mm6
/* Scale the chosen colour by the replicated (0xffff - A) >> 8 factor */
pmullw %mm6,%mm5
/* x += x_step; */
addl 24(%ebp),%ebx
/* x_scale = x >> 16; */
movl %ebx,%edx
sarl $16,%edx
/* Add the scaled colour to the weighted source sum, take the high byte of
 * each word, pack to bytes and store 4 bytes at p */
paddw %mm5,%mm7
psrlw $8,%mm7
packuswb %mm7, %mm7
movd %mm7,(%esi)
/* p += 4; stop when p == p_end (equality test, so p_end must be hit exactly) */
addl $4, %esi
cmpl %esi,28(%ebp)
je .out
/* Same source column as before: reuse the loaded pixels */
cmpl %edx,-24(%ebp)
je .loop
.newx:
/* Remember the new source column index */
movl %edx,-24(%ebp)
/*
 * Load the two new values into %mm1, %mm3, move old values into %mm0, %mm2
 */
movq %mm1, %mm0
movq %mm3, %mm2
shll $2, %edx
/* The loop body leaves data in %mm4, so it is cleared explicitly here */
pxor %mm4, %mm4
/* New pixel from q1, premultiplied exactly as in the setup code above */
movl 16(%ebp),%edi
movl (%edi, %edx), %eax
movd (%edi, %edx), %mm5
punpcklbw %mm4, %mm5
shrl $24, %eax
movl $0x010101, %ecx
mull %ecx
/*
 * mull destroyed %edx, need to reconstitute
 */
movl -24(%ebp),%edx
shll $2, %edx
orl $0xff000000, %eax
movd %eax, %mm1
punpcklbw %mm4, %mm1
pmullw %mm5,%mm1
/* New pixel from q2 into %mm3 */
movl 20(%ebp),%edi
movl (%edi, %edx), %eax
movd (%edi, %edx), %mm5
punpcklbw %mm4, %mm5
shrl $24, %eax
movl $0x010101, %ecx
mull %ecx
orl $0xff000000, %eax
movd %eax, %mm3
punpcklbw %mm4, %mm3
pmullw %mm5,%mm3
psrlw $8,%mm1
psrlw $8,%mm3
/* %edi was used for q1/q2 above; point it back at the weights for .loop */
movl 8(%ebp),%edi
jmp .loop
.out:
/* Return the current destination pointer */
movl %esi,%eax
/* Leave MMX state so the FPU is usable again */
emms
/* -40 = 28 bytes of locals + 3 saved registers; restore them and the frame */
leal -40(%ebp),%esp
popl %ebx
popl %esi
popl %edi
movl %ebp,%esp
popl %ebp
ret
/*
* Copyright (C) 2000 Red Hat, Inc
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
/*
 * Runtime probe for MMX support on 32-bit x86.  Takes no arguments and
 * returns 1 in %eax when the CPUID instruction is available and reports the
 * feature bit tested below, 0 otherwise.
 */
.file "have_mmx.S"
.version "01.01"
gcc2_compiled.:
.text
.align 16
#if !defined(__MINGW32__) && !defined(__CYGWIN__) && !defined(__INTERIX)
/* Magic indicating no need for an executable stack */
#if !defined __powerpc64__ && !defined __ia64__
.section .note.GNU-stack; .previous
#endif
/* Targets other than MinGW/Cygwin/Interix: one leading underscore, plus a .type annotation */
.globl _pixops_have_mmx
.type _pixops_have_mmx,@function
_pixops_have_mmx:
#else
/* MinGW/Cygwin/Interix: the symbol is emitted with two leading underscores */
.globl __pixops_have_mmx
__pixops_have_mmx:
#endif
/* %ebx is used as scratch below and is overwritten by cpuid, so save it */
push %ebx
/* # Check if bit 21 in flags word is writeable */
/* Read EFLAGS into %eax and keep an unmodified copy in %ebx */
pushfl
popl %eax
movl %eax,%ebx
/* Flip bit 21, write the value back to EFLAGS, then read EFLAGS again */
xorl $0x00200000, %eax
pushl %eax
popfl
pushfl
popl %eax
/* If the re-read value equals the original, the bit did not stick: no CPUID */
cmpl %eax, %ebx
je .notfound
/* # OK, we have CPUID */
/* CPUID leaf 1; bit 23 of %edx is the MMX feature flag */
movl $1, %eax
cpuid
test $0x00800000, %edx
jz .notfound
/* MMX present: return 1 */
movl $1, %eax
jmp .out
.notfound:
/* No CPUID or no MMX: return 0 */
movl $0, %eax
.out:
/* Restore the saved %ebx and return with the result in %eax */
popl %ebx
ret
/*
* Copyright (C) 2000 Red Hat, Inc
*
* This library is free software; you can redistribute it and/or
* modify it under the terms of the GNU Lesser General Public
* License as published by the Free Software Foundation; either
* version 2 of the License, or (at your option) any later version.
*
* This library is distributed in the hope that it will be useful,
* but WITHOUT ANY WARRANTY; without even the implied warranty of
* MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the GNU
* Lesser General Public License for more details.
*
* You should have received a copy of the GNU Lesser General Public
* License along with this library; if not, see <http://www.gnu.org/licenses/>.
*/
/* Prototypes for the hand-written MMX assembly routines; they are only
 * declared when USE_MMX is defined. */
#ifdef USE_MMX
/* NOTE(review): the implementation of this routine is not shown alongside
 * the others; by name and signature it is presumably the 3-channel scale
 * counterpart of the composite routines below -- confirm. */
guchar *_pixops_scale_line_22_33_mmx (guint32 weights[16][8], guchar *p, guchar *q1, guchar *q2, int x_step, guchar *p_stop, int x_init);
/* Composite one destination line from p up to p_stop using source rows q1
 * and q2 and the pre-expanded weights table; returns the destination
 * pointer reached when the loop stops. */
guchar *_pixops_composite_line_22_4a4_mmx (guint32 weights[16][8], guchar *p, guchar *q1, guchar *q2, int x_step, guchar *p_stop, int x_init);
/* As above, but each pixel is blended against an entry of colors selected
 * by (dest_x >> check_shift) & 1, with dest_x advancing by one per pixel. */
guchar *_pixops_composite_line_color_22_4a4_mmx (guint32 weights[16][8], guchar *p, guchar *q1, guchar *q2, int x_step, guchar *p_stop, int x_init, int dest_x, int check_shift, int *colors);
/* Returns 1 when CPUID is available and reports MMX support, 0 otherwise. */
int _pixops_have_mmx (void);
#endif
......@@ -23,7 +23,6 @@
#include "../fallback-c89.c"
#include "pixops.h"
#include "pixops-internal.h"
#define SUBSAMPLE_BITS 4
#define SUBSAMPLE (1 << SUBSAMPLE_BITS)
......@@ -837,36 +836,6 @@ composite_line_22_4a4 (int *weights, int n_x, int n_y,
return dest;
}
#ifdef USE_MMX
/*
 * Line-function adapter for the MMX composite routine.
 *
 * Expands the 16 x 4 integer filter weights into the 16 x 8 table of
 * 32-bit entries that the assembly expects: each weight is reduced with
 * ">> 8", duplicated into both 16-bit halves of a 32-bit entry, and that
 * entry is stored twice in a row, so every weight occupies four 16-bit
 * lanes.  The assembly is then called with dest, src[0], src[1], x_step,
 * dest_end and x_init; the remaining parameters are not used here.
 *
 * Returns whatever _pixops_composite_line_22_4a4_mmx() returns.
 */
static guchar *
composite_line_22_4a4_mmx_stub (int *weights, int n_x, int n_y, guchar *dest,
                                int dest_x, guchar *dest_end,
                                int dest_channels, int dest_has_alpha,
                                guchar **src, int src_channels,
                                gboolean src_has_alpha, int x_init,
                                int x_step, int src_width, int check_size,
                                guint32 color1, guint32 color2)
{
  guint32 mmx_weights[16][8];
  int phase;
  int tap;

  for (phase = 0; phase < 16; phase++)
    {
      for (tap = 0; tap < 4; tap++)
        {
          /* One weight, replicated into the low and high 16 bits. */
          guint32 packed = 0x00010001 * (weights[4 * phase + tap] >> 8);

          mmx_weights[phase][2 * tap] = packed;
          mmx_weights[phase][2 * tap + 1] = packed;
        }
    }

  return _pixops_composite_line_22_4a4_mmx (mmx_weights, dest, src[0], src[1],
                                            x_step, dest_end, x_init);
}
#endif /* USE_MMX */
static void
composite_pixel_color (guchar *dest, int dest_x, int dest_channels,
int dest_has_alpha, int src_has_alpha, int check_size,
......@@ -980,44 +949,6 @@ composite_line_color (int *weights, int n_x, int n_y, guchar *dest,
return dest;
}
#ifdef USE_MMX
static guchar *
composite_line_color_22_4a4_mmx_stub (int *weights, int n_x, int n_y,
guchar *dest, int dest_x,
guchar *dest_end, int dest_channels,
int dest_has_alpha, guchar **src,
int src_channels, gboolean src_has_alpha,
int x_init, int x_step, int src_width,
int check_size, guint32 color1,
guint32 color2)
{
guint32 mmx_weights[16][8];
int check_shift = get_check_shift (check_size);
int colors[4];
int j;
for (j=0; j<16; j++)
{
mmx_weights[j][0] = 0x00010001 * (weights[4*j] >> 8);
mmx_weights[j][1] = 0x00010001 * (weights[4*j] >> 8);