gimp-composite-mmx.c 65.8 KB
Newer Older
/* -*- mode: c tab-width: 2; c-basic-indent: 2; indent-tabs-mode: nil -*-
 *
 * The GIMP -- an image manipulation program
 * Copyright (C) 1995 Spencer Kimball and Peter Mattis
 *
 *
 * This program is free software; you can redistribute it and/or modify
 * it under the terms of the GNU General Public License as published by
 * the Free Software Foundation; either version 2 of the License, or
 * (at your option) any later version.
 *
 * This program is distributed in the hope that it will be useful,
 * but WITHOUT ANY WARRANTY; without even the implied warranty of
 * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.  See the
 * GNU General Public License for more details.
 *
 * You should have received a copy of the GNU General Public License
 * along with this program; if not, write to the Free Software
 * Foundation, Inc., 59 Temple Place - Suite 330, Boston, MA 02111-1307, USA.
 */

/* Much of the content of this file is derived from works of David
 * Monniaux, which are Copyright (C) 1999, 2001 David Monniaux.
 * Tip-o-the-hat to David for pioneering this effort.
 *
 * All of these functions use the mmx registers and expect them to
 * remain intact across multiple asm() constructs.  This may not work
 * in the future, if the compiler allocates mmx registers for its own
 * use. XXX
 */

32 33
#include "config.h"

Helvetix Victorinox's avatar
Helvetix Victorinox committed
34
#include <stdio.h>
35 36 37 38

#include <glib-object.h>

#include "base/base-types.h"
39
#include "base/cpu-accel.h"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
40 41 42

#include "gimp-composite.h"
#include "gimp-composite-mmx.h"
43
#include "gimp-composite-x86.h"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
44

45 46 47 48
#if defined(USE_MMX)
#if defined(ARCH_X86)
#if __GNUC__ >= 3

Helvetix Victorinox's avatar
Helvetix Victorinox committed
49 50
/*
 * pminub(src,dst,tmp): per-byte unsigned minimum built from MMX-only
 * instructions.  tmp = dst; tmp = saturate(tmp - src), i.e. max(dst - src, 0);
 * dst = dst - tmp.  The result min(src,dst) is left in dst; tmp is
 * clobbered and src is unchanged.  The expansion is an asm template
 * fragment that begins with a tab and ends with a newline.
 */
#define pminub(src,dst,tmp)  "\tmovq %%" #dst ", %%" #tmp ";" "psubusb %%" #src ", %%" #tmp ";" "psubb %%" #tmp ", %%" #dst "\n"
/*
 * pmaxub(a,b,tmp): per-byte unsigned maximum.  tmp = a;
 * tmp = saturate(tmp - b), i.e. max(a - b, 0); b = b + tmp.  The result
 * max(a,b) is left in b (the SECOND argument); tmp is clobbered and a is
 * unchanged.
 */
#define pmaxub(a,b,tmp)      "\tmovq %%" #a ", %%" #tmp ";" "psubusb %%" #b ", %%" #tmp ";" "paddb %%" #tmp ", %%" #b "\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
51 52


53

Helvetix Victorinox's avatar
Helvetix Victorinox committed
54
void
55
debug_display_mmx(void)
Helvetix Victorinox's avatar
Helvetix Victorinox committed
56 57 58 59 60 61 62 63 64 65 66
{
#define mask32(x) ((x)& (unsigned long long) 0xFFFFFFFF)
#define print64(reg) { unsigned long long reg; asm("movq %%" #reg ",%0" : "=m" (reg)); printf(#reg"=%08llx %08llx", mask32(reg>>32), mask32(reg)); }
  printf("--------------------------------------------\n");
  print64(mm0); printf("  "); print64(mm1); printf("\n");
  print64(mm2); printf("  "); print64(mm3); printf("\n");
  print64(mm4); printf("  "); print64(mm5); printf("\n");
  print64(mm6); printf("  "); print64(mm7); printf("\n");
  printf("--------------------------------------------\n");
}

67 68 69 70 71 72 73 74
/*
 * 64-bit constants for the rgba8 routines.  Each is stored as two
 * identical 32-bit words so that a single movq of the array fills one MMX
 * register with the same pattern in both halves (the loops in this file
 * handle 8 bytes per iteration).
 *
 * The _b* constants are per-byte patterns, the _w* constants per-16-bit-word
 * patterns; rgba8_alpha_mask selects the most significant byte of each
 * 32-bit word.
 */
static const guint32 rgba8_alpha_mask[2] = { 0xFF000000, 0xFF000000 };
static const guint32 rgba8_b1[2] =         { 0x01010101, 0x01010101 };
static const guint32 rgba8_b255[2] =       { 0xFFFFFFFF, 0xFFFFFFFF };
static const guint32 rgba8_w1[2] =         { 0x00010001, 0x00010001 };
static const guint32 rgba8_w2[2] =         { 0x00020002, 0x00020002 };
static const guint32 rgba8_w128[2] =       { 0x00800080, 0x00800080 };
static const guint32 rgba8_w256[2] =       { 0x01000100, 0x01000100 };
static const guint32 rgba8_w255[2] =       { 0X00FF00FF, 0X00FF00FF };
75

76 77 78 79
/*
 * 64-bit constants for the va8 routines, laid out like the rgba8 ones
 * (two identical 32-bit words each).  va8_alpha_mask selects the upper
 * byte of every 16-bit word.
 * NOTE(review): presumably va8 is a two-byte value+alpha pixel; the code
 * using these constants is outside this part of the file -- confirm.
 */
static const guint32 va8_alpha_mask[2] =   { 0xFF00FF00, 0xFF00FF00 };
static const guint32 va8_b255[2] =         { 0xFFFFFFFF, 0xFFFFFFFF };
static const guint32 va8_w1[2] =           { 0x00010001, 0x00010001 };
static const guint32 va8_w255[2] =         { 0x00FF00FF, 0x00FF00FF };
Tor Lillqvist's avatar
Update.  
Tor Lillqvist committed
80

Helvetix Victorinox's avatar
Helvetix Victorinox committed
81 82 83 84
/*
 *
 */
void
85
gimp_composite_addition_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
Helvetix Victorinox's avatar
Helvetix Victorinox committed
86 87 88
{
  GimpCompositeContext op = *_op;

89
  asm volatile ("movq    %0,%%mm0"
90 91 92 93 94 95
                : /* empty */
                : "m" (*rgba8_alpha_mask)
                : "%mm0");

  for (; op.n_pixels >= 2; op.n_pixels -= 2)
    {
96 97 98 99 100 101 102 103 104 105 106 107 108 109 110 111 112
      asm ("  movq    %0, %%mm2\n"
           "\tmovq    %1, %%mm3\n"
           "\tmovq    %%mm2, %%mm4\n"
           "\tpaddusb %%mm3, %%mm4\n"
           "\tmovq    %%mm0, %%mm1\n"
           "\tpandn   %%mm4, %%mm1\n"
           "\t" pminub(mm3, mm2, mm4) "\n"
           "\tpand    %%mm0, %%mm2\n"
           "\tpor     %%mm2, %%mm1\n"
           "\tmovq    %%mm1, %2\n"
           : /* empty */
           : "m" (*op.A), "m" (*op.B), "m" (*op.D)
           : "0", "1", "2", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
      op.A += 8;
      op.B += 8;
      op.D += 8;
    }
113 114 115

  if (op.n_pixels)
    {
116 117
      asm volatile ("  movd    %0, %%mm2\n"
                    "\tmovd    %1, %%mm3\n"
118 119 120 121 122 123 124 125 126 127 128 129
                    "\tmovq    %%mm2, %%mm4\n"
                    "\tpaddusb %%mm3, %%mm4\n"
                    "\tmovq    %%mm0, %%mm1\n"
                    "\tpandn   %%mm4, %%mm1\n"
                    "\t" pminub(mm3, mm2, mm4) "\n"
                    "\tpand    %%mm0, %%mm2\n"
                    "\tpor     %%mm2, %%mm1\n"
                    "\tmovd    %%mm1, %2\n"
                    : /* empty */
                    : "m" (*op.A), "m" (*op.B), "m" (*op.D)
                    : "0", "1", "2", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
    }
Helvetix Victorinox's avatar
Helvetix Victorinox committed
130 131 132 133

  asm("emms");
}

134 135
/*
 * gimp_composite_burn_rgba8_rgba8_rgba8_mmx:
 * @_op: composite context.  It is copied into a local, so the caller's
 *       A/B/D pointers and n_pixels are not modified.
 *
 * For each pixel, the bytes not selected by rgba8_alpha_mask become
 * 255 - ((255 - A) * 256) / (B + 1), saturated to 0..255, and the byte
 * selected by the mask becomes min(A, B).  The 16-bit divisions are done
 * by the pdivwqX() macro from gimp-composite-x86.h
 * (NOTE(review): argument order assumed to be dividend, divisor,
 * quotient -- confirm against the header).
 *
 * Pixels are handled two at a time; a trailing odd pixel is loaded and
 * stored with 32-bit movd.
 *
 * Constraint notes: D is an output operand because the asm writes it.
 * The clobber lists name every MMX register the block writes.  The
 * general purpose registers eax/ecx/edx are listed by name; they were
 * previously clobbered only by accident through the strings "0", "1",
 * "2", which GCC reads as hard register numbers.  pdivwqX() presumably
 * needs them as scratch for the integer divide -- verify against the
 * header.
 */
void
gimp_composite_burn_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
{
  GimpCompositeContext op = *_op;

  for (; op.n_pixels >= 2; op.n_pixels -= 2)
    {
      asm volatile ("  movq         %1,%%mm0\n"   /* mm0 = A */
                    "\tmovq         %2,%%mm1\n"   /* mm1 = B */

                    /* low four bytes */
                    "\tmovq         %3,%%mm2\n"
                    "\tpsubb     %%mm0,%%mm2\n" /* mm2 = 255 - A */
                    "\tpxor      %%mm4,%%mm4\n"
                    "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256  */

                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm5,%%mm5\n"
                    "\tpunpcklbw %%mm5,%%mm3\n"
                    "\tmovq         %4,%%mm5\n"
                    "\tpaddusw   %%mm3,%%mm5\n" /* mm5 = B + 1 */

                    "\t" pdivwqX(mm4,mm5,mm7) "\n" /* mm7 = quotient, low half */

                    /* high four bytes */
                    "\tmovq         %3,%%mm2\n"
                    "\tpsubb     %%mm0,%%mm2\n" /* mm2 = 255 - A */
                    "\tpxor      %%mm4,%%mm4\n"
                    "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256  */

                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm5,%%mm5\n"
                    "\tpunpckhbw %%mm5,%%mm3\n"
                    "\tmovq         %4,%%mm5\n"
                    "\tpaddusw   %%mm3,%%mm5\n" /* mm5 = B + 1 */
                    "\t" pdivwqX(mm4,mm5,mm6) "\n" /* mm6 = quotient, high half */

                    "\tmovq      %5,%%mm4\n"
                    "\tmovq      %%mm4,%%mm5\n"
                    "\tpsubusw   %%mm6,%%mm4\n" /* mm4 = 255 - quotient (high) */
                    "\tpsubusw   %%mm7,%%mm5\n" /* mm5 = 255 - quotient (low) */

                    "\tpackuswb  %%mm4,%%mm5\n" /* mm5 = eight result bytes */

                    "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */

                    "\tmovq         %6,%%mm7\n"   /* mm7 = alpha mask */
                    "\tpand      %%mm7,%%mm1\n" /* mm1 = min(A,B) & alpha_mask */

                    "\tpandn     %%mm5,%%mm7\n" /* mm7 = ~alpha_mask & mm5 */
                    "\tpor       %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */

                    "\tmovq      %%mm7,%0\n"
                    : "=m" (*op.D)
                    : "m" (*op.A), "m" (*op.B), "m" (*rgba8_b255), "m" (*rgba8_w1), "m" (*rgba8_w255), "m" (*rgba8_alpha_mask)
                    : "%eax", "%ecx", "%edx", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
      op.A += 8;
      op.B += 8;
      op.D += 8;
    }

  /* Odd pixel left over: identical arithmetic, 32-bit load and store. */
  if (op.n_pixels)
    {
      asm volatile ("  movd      %1,%%mm0\n"
                    "\tmovd      %2,%%mm1\n"

                    "\tmovq      %3,%%mm2\n"
                    "\tpsubb     %%mm0,%%mm2\n" /* mm2 = 255 - A */
                    "\tpxor      %%mm4,%%mm4\n"
                    "\tpunpcklbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256  */

                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm5,%%mm5\n"
                    "\tpunpcklbw %%mm5,%%mm3\n"
                    "\tmovq      %4,%%mm5\n"
                    "\tpaddusw   %%mm3,%%mm5\n" /* mm5 = B + 1 */

                    "\t" pdivwqX(mm4,mm5,mm7) "\n"

                    "\tmovq      %3,%%mm2\n"
                    "\tpsubb   %%mm0,%%mm2\n" /* mm2 = 255 - A */
                    "\tpxor      %%mm4,%%mm4\n"
                    "\tpunpckhbw %%mm2,%%mm4\n" /* mm4 = (255- A) * 256  */

                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm5,%%mm5\n"
                    "\tpunpckhbw %%mm5,%%mm3\n"
                    "\tmovq      %4,%%mm5\n"
                    "\tpaddusw   %%mm3,%%mm5\n" /* mm5 = B + 1 */
                    "\t" pdivwqX(mm4,mm5,mm6) "\n"

                    "\tmovq      %5,%%mm4\n"
                    "\tmovq      %%mm4,%%mm5\n"
                    "\tpsubusw     %%mm6,%%mm4\n"
                    "\tpsubusw     %%mm7,%%mm5\n"

                    "\tpackuswb  %%mm4,%%mm5\n"

                    "\t" pminub(mm0,mm1,mm3) "\n" /* mm1 = min(mm0,mm1) clobber mm3 */

                    "\tmovq      %6,%%mm7\n"
                    "\tpand      %%mm7,%%mm1\n" /* mm1 = min(A,B) & alpha_mask */

                    "\tpandn     %%mm5,%%mm7\n" /* mm7 = ~alpha_mask & mm5 */
                    "\tpor       %%mm1,%%mm7\n" /* mm7 = mm7 | mm1 */

                    "\tmovd      %%mm7,%0\n"
                    : "=m" (*op.D)
                    : "m" (*op.A), "m" (*op.B), "m" (*rgba8_b255), "m" (*rgba8_w1), "m" (*rgba8_w255), "m" (*rgba8_alpha_mask)
                    : "%eax", "%ecx", "%edx", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
    }

  /* Leave MMX state so that x87 floating point code can run again. */
  asm("emms");
}


void
249
gimp_composite_darken_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
Helvetix Victorinox's avatar
Helvetix Victorinox committed
250 251 252
{
  GimpCompositeContext op = *_op;

253
  for (; op.n_pixels >= 2; op.n_pixels -= 2)
254
    {
255 256 257 258 259 260 261 262 263 264 265
      asm volatile ("  movq    %1, %%mm2\n"
                    "\tmovq    %2, %%mm3\n"
                    "\t" pminub(mm3, mm2, mm4) "\n"
                    "\tmovq    %%mm2, %0\n"
                    : "=m" (*op.D)
                    : "m" (*op.A), "m" (*op.B)
                    : "%mm1", "%mm2", "%mm3", "%mm4");
      op.A += 8;
      op.B += 8;
      op.D += 8;
    }
Helvetix Victorinox's avatar
Helvetix Victorinox committed
266

267 268
  if (op.n_pixels)
    {
269 270 271 272 273 274 275 276
      asm volatile ("  movd    %0, %%mm2\n"
                    "\tmovd    %1, %%mm3\n"
                    "\t" pminub(mm3, mm2, mm4) "\n"
                    "\tmovd    %%mm2, %2\n"
                    : /* empty */
                    : "m" (*op.A), "m" (*op.B), "m" (*op.D)
                    : "0", "1", "2", "%mm2", "%mm3", "%mm4");
    }
277

Helvetix Victorinox's avatar
Helvetix Victorinox committed
278 279 280 281
  asm("emms");
}

void
282
gimp_composite_difference_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
Helvetix Victorinox's avatar
Helvetix Victorinox committed
283 284 285
{
  GimpCompositeContext op = *_op;

286 287 288
  asm volatile ("movq    %0,%%mm0"     :  : "m" (*rgba8_alpha_mask) : "%mm0");

  for (; op.n_pixels >= 2; op.n_pixels -= 2)
289 290 291 292 293 294 295 296 297 298
    {
      asm volatile ("  movq       %0, %%mm2\n"
                    "\tmovq       %1, %%mm3\n"
                    "\tmovq    %%mm2, %%mm4\n"
                    "\tmovq    %%mm3, %%mm5\n"
                    "\tpsubusb %%mm3, %%mm4\n"
                    "\tpsubusb %%mm2, %%mm5\n"
                    "\tpaddb   %%mm5, %%mm4\n"
                    "\tmovq    %%mm0, %%mm1\n"
                    "\tpandn   %%mm4, %%mm1\n"
299
                    "\t" pminub(mm3,mm2,mm4) "\n"
300 301 302 303 304 305 306 307 308 309
                    "\tpand    %%mm0, %%mm2\n"
                    "\tpor     %%mm2, %%mm1\n"
                    "\tmovq    %%mm1, %2\n"
                    : /* empty */
                    : "m" (*op.A), "m" (*op.B), "m" (*op.D)
                    : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
      op.A += 8;
      op.B += 8;
      op.D += 8;
    }
310

311
  if (op.n_pixels)
312 313 314 315 316 317 318 319 320 321
    {
      asm volatile ("  movd       %0, %%mm2\n"
                    "\tmovd       %1, %%mm3\n"
                    "\tmovq    %%mm2, %%mm4\n"
                    "\tmovq    %%mm3, %%mm5\n"
                    "\tpsubusb %%mm3, %%mm4\n"
                    "\tpsubusb %%mm2, %%mm5\n"
                    "\tpaddb   %%mm5, %%mm4\n"
                    "\tmovq    %%mm0, %%mm1\n"
                    "\tpandn   %%mm4, %%mm1\n"
322
                    "\t" pminub(mm3,mm2,mm4) "\n"
323 324 325 326 327 328 329
                    "\tpand    %%mm0, %%mm2\n"
                    "\tpor     %%mm2, %%mm1\n"
                    "\tmovd    %%mm1, %2\n"
                    : /* empty */
                    : "m" (*op.A), "m" (*op.B), "m" (*op.D)
                    : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
    }
Helvetix Victorinox's avatar
Helvetix Victorinox committed
330 331 332 333 334

  asm("emms");
}

void
335
xxxgimp_composite_divide_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
Helvetix Victorinox's avatar
Helvetix Victorinox committed
336 337 338
{
  GimpCompositeContext op = *_op;

339 340 341
  printf("A=%d B=%d %d  ", op.pixelformat_A, op.pixelformat_B, GIMP_PIXELFORMAT_RGBA8); fflush(stdout);

  asm volatile ("  movq    %0, %%mm0\n"
342 343 344 345
                "\tmovq    %1, %%mm7\n"
                :
                : "m" (*rgba8_alpha_mask), "m" (*rgba8_w1)
                : "%mm0", "%mm7");
346

347
  for (; op.n_pixels >= 2; op.n_pixels -= 2)
348 349 350 351 352
    {
      asm volatile ("  movq         %0,%%mm0\n"
                    "\tmovq         %1,%%mm1\n"
                    "\tpxor      %%mm2,%%mm2\n"
                    "\tpunpcklbw %%mm0,%%mm2\n" /* mm2 = A*256 */
Helvetix Victorinox's avatar
Helvetix Victorinox committed
353

354 355 356 357
                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm5,%%mm5\n"
                    "\tpunpcklbw %%mm5,%%mm3\n"
                    "\tpaddw     %%mm7,%%mm3\n" /* mm3 = B+1 */
Helvetix Victorinox's avatar
Helvetix Victorinox committed
358

359
                    "\t" pdivwuqX(mm2,mm3,mm5) "\n" /* mm5 = (A*256)/(B+1) */
Helvetix Victorinox's avatar
Helvetix Victorinox committed
360

361 362
                    "\tpxor      %%mm2,%%mm2\n"
                    "\tpunpckhbw %%mm0,%%mm2\n" /* mm2 = A*256 */
Helvetix Victorinox's avatar
Helvetix Victorinox committed
363

364 365 366 367
                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm6,%%mm6\n"
                    "\tpunpckhbw %%mm6,%%mm3\n"
                    "\tpaddw     %%mm7,%%mm3\n" /* mm3 = B+1 */
Helvetix Victorinox's avatar
Helvetix Victorinox committed
368

369
                    "\t" pdivwuqX(mm2,mm3,mm4) "\n" /* mm4 = (A*256)/(B+1) */
370

371
                    "\tpackuswb  %%mm4,%%mm5\n" /* expects mm4 and mm5 to be signed values */
Helvetix Victorinox's avatar
Helvetix Victorinox committed
372

373 374 375
                    "\t" pminub(mm0,mm1,mm3) "\n"
                    "\tmovq      %3,%%mm3\n"
                    "\tmovq      %%mm3,%%mm2\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
376

377
                    "\tpandn     %%mm5,%%mm3\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
378

379 380
                    "\tpand      %%mm2,%%mm1\n"
                    "\tpor       %%mm1,%%mm3\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
381

382 383 384 385 386 387 388 389
                    "\tmovq      %%mm3,%2\n"
                    : /* empty */
                    : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_alpha_mask)
                    : "%eax", "%ecx", "%edx", "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
      op.A += 8;
      op.B += 8;
      op.D += 8;
    }
Helvetix Victorinox's avatar
Helvetix Victorinox committed
390

391
  if (op.n_pixels)
392 393 394 395 396
    {
      asm volatile ("  movd         %0,%%mm0\n"
                    "\tmovd         %1,%%mm1\n"
                    "\tpxor      %%mm2,%%mm2\n"
                    "\tpunpcklbw %%mm0,%%mm2\n" /* mm2 = A*256 */
Helvetix Victorinox's avatar
Helvetix Victorinox committed
397

398 399 400 401
                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm5,%%mm5\n"
                    "\tpunpcklbw %%mm5,%%mm3\n"
                    "\tpaddw     %%mm7,%%mm3\n" /* mm3 = B+1 */
Helvetix Victorinox's avatar
Helvetix Victorinox committed
402

403
                    "\t" pdivwuqX(mm2,mm3,mm5) "\n" /* mm5 = (A*256)/(B+1) */
Helvetix Victorinox's avatar
Helvetix Victorinox committed
404

405 406
                    "\tpxor      %%mm2,%%mm2\n"
                    "\tpunpckhbw %%mm0,%%mm2\n" /* mm2 = A*256 */
Helvetix Victorinox's avatar
Helvetix Victorinox committed
407

408 409 410 411
                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm6,%%mm6\n"
                    "\tpunpckhbw %%mm6,%%mm3\n"
                    "\tpaddw     %%mm7,%%mm3\n" /* mm3 = B+1 */
Helvetix Victorinox's avatar
Helvetix Victorinox committed
412

413
                    "\t" pdivwuqX(mm2,mm3,mm4) "\n" /* mm4 = (A*256)/(B+1) */
414

415
                    "\tpackuswb  %%mm4,%%mm5\n" /* expects mm4 and mm5 to be signed values */
Helvetix Victorinox's avatar
Helvetix Victorinox committed
416

417 418 419
                    "\t" pminub(mm0,mm1,mm3) "\n"
                    "\tmovq         %3,%%mm3\n"
                    "\tmovq      %%mm3,%%mm2\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
420

421
                    "\tpandn     %%mm5,%%mm3\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
422

423 424
                    "\tpand      %%mm2,%%mm1\n"
                    "\tpor       %%mm1,%%mm3\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
425

426 427 428 429 430
                    "\tmovd      %%mm3,%2\n"
                    : /* empty */
                    : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_alpha_mask)
                    : "%eax", "%ecx", "%edx", "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
    }
Helvetix Victorinox's avatar
Helvetix Victorinox committed
431 432 433 434 435

  asm("emms");
}

void
436
gimp_composite_dodge_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
Helvetix Victorinox's avatar
Helvetix Victorinox committed
437 438 439
{
  GimpCompositeContext op = *_op;

440
  for (; op.n_pixels >= 2; op.n_pixels -= 2)
441 442 443 444 445 446 447
    {
      asm volatile ("  movq         %0,%%mm0\n"
                    "\tmovq         %1,%%mm1\n"
                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm2,%%mm2\n"
                    "\tpunpcklbw %%mm2,%%mm3\n"
                    "\tpunpcklbw %%mm0,%%mm2\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
448

449 450
                    "\tmovq         %3,%%mm4\n"
                    "\tpsubw     %%mm3,%%mm4\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
451

452
                    "\t" pdivwuqX(mm2,mm4,mm5) "\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
453

454 455 456 457
                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm2,%%mm2\n"
                    "\tpunpckhbw %%mm2,%%mm3\n"
                    "\tpunpckhbw %%mm0,%%mm2\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
458

459 460
                    "\tmovq         %3,%%mm4\n"
                    "\tpsubw     %%mm3,%%mm4\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
461

462
                    "\t" pdivwuqX(mm2,mm4,mm6) "\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
463

464
                    "\tpackuswb  %%mm6,%%mm5\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
465

466 467 468 469 470
                    "\tmovq         %4,%%mm6\n"
                    "\tmovq      %%mm1,%%mm7\n"
                    "\t" pminub(mm0,mm7,mm2) "\n"
                    "\tpand      %%mm6,%%mm7\n"
                    "\tpandn     %%mm5,%%mm6\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
471

472
                    "\tpor       %%mm6,%%mm7\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
473

474 475 476 477 478 479 480 481
                    "\tmovq      %%mm7,%2\n"
                    : /* empty */
                    : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_w256), "m" (*rgba8_alpha_mask)
                    : "0", "1", "2", "%mm0", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
      op.A += 8;
      op.B += 8;
      op.D += 8;
    }
Helvetix Victorinox's avatar
Helvetix Victorinox committed
482

483
  if (op.n_pixels)
484 485 486 487 488 489 490
    {
      asm volatile ("  movd         %0,%%mm0\n"
                    "\tmovq         %1,%%mm1\n"
                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm2,%%mm2\n"
                    "\tpunpcklbw %%mm2,%%mm3\n"
                    "\tpunpcklbw %%mm0,%%mm2\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
491

492 493
                    "\tmovq         %3,%%mm4\n"
                    "\tpsubw     %%mm3,%%mm4\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
494

495
                    "\t" pdivwuqX(mm2,mm4,mm5) "\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
496

497 498 499 500
                    "\tmovq      %%mm1,%%mm3\n"
                    "\tpxor      %%mm2,%%mm2\n"
                    "\tpunpckhbw %%mm2,%%mm3\n"
                    "\tpunpckhbw %%mm0,%%mm2\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
501

502 503
                    "\tmovq         %3,%%mm4\n"
                    "\tpsubw     %%mm3,%%mm4\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
504

505
                    "\t" pdivwuqX(mm2,mm4,mm6) "\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
506

507
                    "\tpackuswb  %%mm6,%%mm5\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
508

509 510 511 512 513
                    "\tmovq         %4,%%mm6\n"
                    "\tmovq      %%mm1,%%mm7\n"
                    "\t" pminub(mm0,mm7,mm2) "\n"
                    "\tpand      %%mm6,%%mm7\n"
                    "\tpandn     %%mm5,%%mm6\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
514

515
                    "\tpor       %%mm6,%%mm7\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
516

517 518 519 520 521
                    "\tmovd      %%mm7,%2\n"
                    : /* empty */
                    : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_w256), "m" (*rgba8_alpha_mask)
                    : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
    }
Helvetix Victorinox's avatar
Helvetix Victorinox committed
522 523 524 525 526

  asm("emms");
}

void
527
gimp_composite_grain_extract_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
Helvetix Victorinox's avatar
Helvetix Victorinox committed
528 529 530
{
  GimpCompositeContext op = *_op;

531
  asm volatile ("movq    %0,%%mm0"     :  : "m" (*rgba8_alpha_mask) : "%mm0");
Helvetix Victorinox's avatar
Helvetix Victorinox committed
532
  asm volatile ("pxor    %%mm6,%%mm6"  :  :                        : "%mm6");
533
  asm volatile ("movq    %0,%%mm7"     :  : "m" (*rgba8_w128)       : "%mm7");
Helvetix Victorinox's avatar
Helvetix Victorinox committed
534

535
  for (; op.n_pixels >= 2; op.n_pixels -= 2)
536 537 538 539 540 541 542 543 544 545 546 547 548 549 550 551 552 553 554 555
    {
      asm volatile ("  movq         %0,%%mm2\n"
                    "\tmovq         %1,%%mm3\n"
                    mmx_low_bytes_to_words(mm2,mm4,mm6)
                    mmx_low_bytes_to_words(mm3,mm5,mm6)
                    "\tpsubw     %%mm5,%%mm4\n"
                    "\tpaddw     %%mm7,%%mm4\n"
                    "\tmovq      %%mm4,%%mm1\n"

                    mmx_high_bytes_to_words(mm2,mm4,mm6)
                    mmx_high_bytes_to_words(mm3,mm5,mm6)

                    "\tpsubw     %%mm5,%%mm4\n"
                    "\tpaddw     %%mm7,%%mm4\n"

                    "\tpackuswb  %%mm4,%%mm1\n"
                    "\tmovq      %%mm1,%%mm4\n"

                    "\tmovq      %%mm0,%%mm1\n"
                    "\tpandn     %%mm4,%%mm1\n"
556

557 558 559 560 561 562 563 564 565 566 567 568
                    "\t" pminub(mm3,mm2,mm4) "\n"
                    "\tpand      %%mm0,%%mm2\n"

                    "\tpor       %%mm2,%%mm1\n"
                    "\tmovq      %%mm1,%2\n"
                    : /* empty */
                    : "m" (*op.A), "m" (*op.B), "m" (*op.D)
                    : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
      op.A += 8;
      op.B += 8;
      op.D += 8;
    }
Helvetix Victorinox's avatar
Helvetix Victorinox committed
569

570
  if (op.n_pixels)
571 572 573
    {
      asm volatile ("  movd    %0, %%mm2\n"
                    "\tmovd    %1, %%mm3\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
574

575 576
                    mmx_low_bytes_to_words(mm2,mm4,mm6)
                    mmx_low_bytes_to_words(mm3,mm5,mm6)
Helvetix Victorinox's avatar
Helvetix Victorinox committed
577

578 579 580
                    "\tpsubw     %%mm5, %%mm4\n"
                    "\tpaddw     %%mm7, %%mm4\n"
                    "\tmovq      %%mm4, %%mm1\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
581

582
                    "\tpackuswb  %%mm6, %%mm1\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
583

584
                    "\tmovq      %%mm1, %%mm4\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
585

586
                    "\tmovq      %%mm0, %%mm1; pandn     %%mm4, %%mm1\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
587

588 589
                    "\t" pminub(mm3,mm2,mm4) "\n"
                    "\tpand      %%mm0, %%mm2\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
590

591 592 593 594 595 596
                    "\tpor       %%mm2, %%mm1\n"
                    "\tmovd      %%mm1, %2\n"
                    : /* empty */
                    : "m" (*op.A), "m" (*op.B), "m" (*op.D)
                    : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
    }
Helvetix Victorinox's avatar
Helvetix Victorinox committed
597 598 599 600 601

  asm("emms");
}

void
602
gimp_composite_grain_merge_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
Helvetix Victorinox's avatar
Helvetix Victorinox committed
603 604 605
{
  GimpCompositeContext op = *_op;

Helvetix Victorinox's avatar
Helvetix Victorinox committed
606
  asm volatile ("movq    %0, %%mm0\n"
607 608
                "pxor    %%mm6, %%mm6\n"
                "movq    %1, %%mm7\n"
609
                : /* empty */
610 611
                : "m" (*rgba8_alpha_mask), "m" (*rgba8_w128)
                : "%mm0", "%mm6", "%mm7");
Helvetix Victorinox's avatar
Helvetix Victorinox committed
612

613
  for (; op.n_pixels >= 2; op.n_pixels -= 2)
614 615 616
    {
      asm volatile ("  movq    %0, %%mm2\n"
                    "\tmovq    %1, %%mm3\n"
617

618 619 620 621
                    mmx_low_bytes_to_words(mm2,mm4,mm6)
                    mmx_low_bytes_to_words(mm3,mm5,mm6)
                    "\tpaddw     %%mm5, %%mm4\n"
                    "\tpsubw     %%mm7, %%mm4\n"
622

623 624 625 626
                    mmx_high_bytes_to_words(mm2,mm1,mm6)
                    mmx_high_bytes_to_words(mm3,mm5,mm6)
                    "\tpaddw     %%mm5, %%mm1\n"
                    "\tpsubw     %%mm7, %%mm1\n"
627

628
                    "\tpackuswb  %%mm1, %%mm4\n"
629

630 631 632 633 634 635 636 637 638 639 640 641 642 643
                    "\t" pminub(mm3,mm2,mm5) "\n"
                    "\tpand      %%mm0, %%mm2\n"

                    "\tmovq      %%mm0, %%mm1\n"
                    "\tpandn     %%mm4, %%mm1\n"
                    "\tpor       %%mm2, %%mm1\n"
                    "\tmovq      %%mm1, %2\n"
                    : /* empty */
                    : "m" (*op.A), "m" (*op.B), "m" (*op.D)
                    : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
      op.A += 8;
      op.B += 8;
      op.D += 8;
    }
Helvetix Victorinox's avatar
Helvetix Victorinox committed
644

645
  if (op.n_pixels)
646 647 648 649 650 651
    {
      asm volatile ("  movd    %0, %%mm2\n"
                    "\tmovd    %1, %%mm3\n"

                    mmx_low_bytes_to_words(mm2,mm4,mm6)
                    mmx_low_bytes_to_words(mm3,mm5,mm6)
652

653 654 655 656
                    "\tpaddw     %%mm5, %%mm4\n"
                    "\tpsubw     %%mm7, %%mm4\n"
                    "\tmovq      %%mm4, %%mm1\n"
                    "\tpackuswb  %%mm6, %%mm1\n"
657

658
                    "\tmovq      %%mm1, %%mm4\n"
659

660
                    "\tmovq      %%mm0, %%mm1; pandn     %%mm4, %%mm1\n"
661

662 663
                    "\t" pminub(mm3,mm2,mm4) "\n"
                    "\tpand      %%mm0, %%mm2\n"
664

665 666 667 668 669 670
                    "\tpor       %%mm2, %%mm1\n"
                    "\tmovd      %%mm1, %2\n"
                    : /* empty */
                    : "m" (*op.A), "m" (*op.B), "m" (*op.D)
                    : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
    }
Helvetix Victorinox's avatar
Helvetix Victorinox committed
671 672 673 674 675

  asm("emms");
}

void
676
gimp_composite_lighten_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
Helvetix Victorinox's avatar
Helvetix Victorinox committed
677 678 679
{
  GimpCompositeContext op = *_op;

680
  asm volatile ("movq    %0,%%mm0"     :  : "m" (*rgba8_alpha_mask) : "%mm0");
Helvetix Victorinox's avatar
Helvetix Victorinox committed
681

682
  for (; op.n_pixels >= 2; op.n_pixels -= 2)
683 684 685 686 687 688 689 690 691 692 693 694 695 696 697 698 699 700
    {
      asm volatile ("  movq       %0, %%mm2\n"
                    "\tmovq       %1, %%mm3\n"
                    "\tmovq    %%mm2, %%mm4\n"
                    "\t" pmaxub(mm3,mm4,mm5) "\n"
                    "\tmovq    %%mm0, %%mm1\n"
                    "\tpandn   %%mm4, %%mm1\n"
                    "\t" pminub(mm2,mm3,mm4) "\n"
                    "\tpand    %%mm0, %%mm3\n"
                    "\tpor     %%mm3, %%mm1\n"
                    "\tmovq    %%mm1, %2\n"
                    : /* empty */
                    : "m" (*op.A), "m" (*op.B), "m" (*op.D)
                    : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
      op.A += 8;
      op.B += 8;
      op.D += 8;
    }
Helvetix Victorinox's avatar
Helvetix Victorinox committed
701

702
  if (op.n_pixels)
703
    {
704 705
    asm volatile ("  movd    %0, %%mm2\n"
                  "\tmovd    %1, %%mm3\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
706 707 708 709 710 711 712 713 714 715
                  "\tmovq    %%mm2, %%mm4\n"
                  "\t" pmaxub(mm3,mm4,mm5) "\n"

                  "\tmovq    %%mm0, %%mm1\n"
                  "\tpandn   %%mm4, %%mm1\n"

                  "\t" pminub(mm2,mm3,mm4) "\n"

                  "\tpand    %%mm0, %%mm3\n"
                  "\tpor     %%mm3, %%mm1\n"
716
                  "\tmovd    %%mm1, %2\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
717
                  : /* empty */
718
                  : "m" (*op.A), "m" (*op.B), "m" (*op.D)
Helvetix Victorinox's avatar
Helvetix Victorinox committed
719 720 721 722 723 724 725
                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
  }

  asm("emms");
}

void
726
gimp_composite_multiply_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
Helvetix Victorinox's avatar
Helvetix Victorinox committed
727 728 729
{
  GimpCompositeContext op = *_op;

730 731
  asm volatile ("movq    %0,%%mm0"     :  : "m" (*rgba8_alpha_mask) : "%mm0");
  asm volatile ("movq    %0,%%mm7"     :  : "m" (*rgba8_w128) : "%mm7");
Helvetix Victorinox's avatar
Helvetix Victorinox committed
732
  asm volatile ("pxor    %%mm6,%%mm6"  :  :  : "%mm6");
Helvetix Victorinox's avatar
Helvetix Victorinox committed
733

734
  for (; op.n_pixels >= 2; op.n_pixels -= 2)
735 736 737
    {
      asm volatile ("  movq     %0, %%mm2\n"
                    "\tmovq     %1, %%mm3\n"
738

739 740 741
                    mmx_low_bytes_to_words(mm2,mm1,mm6)
                    mmx_low_bytes_to_words(mm3,mm5,mm6)
                    mmx_int_mult(mm5,mm1,mm7)
742

743 744 745
                    mmx_high_bytes_to_words(mm2,mm4,mm6)
                    mmx_high_bytes_to_words(mm3,mm5,mm6)
                    mmx_int_mult(mm5,mm4,mm7)
746

747
                    "\tpackuswb  %%mm4, %%mm1\n"
748

749 750 751 752 753 754
                    "\tmovq      %%mm0, %%mm4\n"
                    "\tpandn     %%mm1, %%mm4\n"
                    "\tmovq      %%mm4, %%mm1\n"
                    "\t" pminub(mm3,mm2,mm4) "\n"
                    "\tpand      %%mm0, %%mm2\n"
                    "\tpor       %%mm2, %%mm1\n"
755

756 757 758 759 760 761 762
                    "\tmovq    %%mm1, %2\n"
                    : /* empty */
                    : "m" (*op.A), "m" (*op.B), "m" (*op.D)
                    : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
      op.A += 8;
      op.B += 8;
      op.D += 8;
Helvetix Victorinox's avatar
Helvetix Victorinox committed
763 764
  }

765
  if (op.n_pixels)
766
    {
767 768
    asm volatile ("  movd     %0, %%mm2\n"
                  "\tmovd     %1, %%mm3\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
769

770 771
                  mmx_low_bytes_to_words(mm2,mm1,mm6)
                  mmx_low_bytes_to_words(mm3,mm5,mm6)
Helvetix Victorinox's avatar
Helvetix Victorinox committed
772
                  pmulwX(mm5,mm1,mm7)
Helvetix Victorinox's avatar
Helvetix Victorinox committed
773

Helvetix Victorinox's avatar
Helvetix Victorinox committed
774
                  "\tpackuswb  %%mm6, %%mm1\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
775 776 777 778 779 780 781

                  "\tmovq      %%mm0, %%mm4\n"
                  "\tpandn     %%mm1, %%mm4\n"
                  "\tmovq      %%mm4, %%mm1\n"
                  "\t" pminub(mm3,mm2,mm4) "\n"
                  "\tpand      %%mm0, %%mm2\n"
                  "\tpor       %%mm2, %%mm1\n"
782

783
                  "\tmovd    %%mm1, %2\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
784
                  : /* empty */
785
                  : "m" (*op.A), "m" (*op.B), "m" (*op.D)
Helvetix Victorinox's avatar
Helvetix Victorinox committed
786 787 788 789 790 791
                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
  }

  asm("emms");
}

792
static void
Helvetix Victorinox's avatar
Helvetix Victorinox committed
793
mmx_op_overlay(void)
Helvetix Victorinox's avatar
Helvetix Victorinox committed
794
{
795 796 797
  asm volatile (
                /* low bytes */
                mmx_low_bytes_to_words(mm3,mm5,mm0)
798 799 800 801 802
                "\tpcmpeqb   %%mm4,%%mm4\n"
                "\tpsubb     %%mm2,%%mm4\n" /* mm4 = 255 - A */
                "\tpunpcklbw %%mm0,%%mm4\n" /* mm4 = (low bytes as word) mm4 */
                "\tmovq         %0,%%mm6\n"  /* mm6 = words of value 2 */
                "\tpmullw    %%mm5,%%mm6\n" /* mm6 = 2 * low bytes of B */
803 804 805 806
                mmx_int_mult(mm6,mm4,mm7)    /* mm4 = INT_MULT(mm6, mm4) */

                /* high bytes */
                mmx_high_bytes_to_words(mm3,mm5,mm0)
807 808 809 810 811
                "\tpcmpeqb   %%mm1,%%mm1\n"
                "\tpsubb     %%mm2,%%mm1\n" /* mm1 = 255 - A */
                "\tpunpckhbw %%mm0,%%mm1\n" /* mm1 = (high bytes as word) mm1 */
                "\tmovq         %0,%%mm6\n"  /* mm6 = words of value 2 */
                "\tpmullw    %%mm5,%%mm6\n" /* mm6 = 2 * high bytes of B */
812 813 814 815 816 817 818 819 820 821 822 823 824 825 826 827
                mmx_int_mult(mm6,mm1,mm7)    /* mm1 = INT_MULT(mm6, mm1) */

                "\tpackuswb  %%mm1,%%mm4\n"  /* mm4 = intermediate value */

                mmx_low_bytes_to_words(mm4,mm5,mm0)
                mmx_low_bytes_to_words(mm2,mm6,mm0)
                "\tpaddw     %%mm6,%%mm5\n"
                mmx_int_mult(mm6,mm5,mm7)   /* mm5 = INT_MULT(mm6, mm5) low bytes */

                mmx_high_bytes_to_words(mm4,mm1,mm0)
                mmx_high_bytes_to_words(mm2,mm6,mm0)
                "\tpaddw     %%mm6,%%mm1\n"
                mmx_int_mult(mm6,mm1,mm7)   /* mm1 = INT_MULT(mm6, mm1) high bytes */

                "\tpackuswb  %%mm1,%%mm5\n"

828 829 830
                "\tmovq         %1,%%mm0\n"
                "\tmovq      %%mm0,%%mm1\n"
                "\tpandn     %%mm5,%%mm1\n"
831 832

                "\t" pminub(mm2,mm3,mm4) "\n"
833
                "\tpand      %%mm0,%%mm3\n"
834

835
                "\tpor       %%mm3,%%mm1\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
836

837 838 839
                : /* empty */
                : "m" (*rgba8_w2), "m" (*rgba8_alpha_mask)
                );
Helvetix Victorinox's avatar
Helvetix Victorinox committed
840 841 842
}

void
843
xxxgimp_composite_overlay_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
Helvetix Victorinox's avatar
Helvetix Victorinox committed
844 845 846
{
  GimpCompositeContext op = *_op;

Helvetix Victorinox's avatar
Helvetix Victorinox committed
847
  asm volatile ("pxor    %%mm0,%%mm0\n"
848
                "movq       %0,%%mm7"
849
                : /* empty */
850
                : "m" (*rgba8_w128) : "%mm0");
Helvetix Victorinox's avatar
Helvetix Victorinox committed
851

852
  for (; op.n_pixels >= 2; op.n_pixels -= 2)
853 854 855
    {
      asm volatile ("  movq         %0,%%mm2\n"
                    "\tmovq         %1,%%mm3\n"
856

857 858 859 860 861 862 863 864
                    /* low bytes */
                    mmx_low_bytes_to_words(mm3,mm5,mm0)
                    "\tpcmpeqb   %%mm4,%%mm4\n"
                    "\tpsubb     %%mm2,%%mm4\n" /* mm4 = 255 - A */
                    "\tpunpcklbw %%mm0,%%mm4\n" /* mm4 = (low bytes as word) mm4 */
                    "\tmovq         %3,%%mm6\n"  /* mm6 = words of value 2 */
                    "\tpmullw    %%mm5,%%mm6\n" /* mm6 = 2 * low bytes of B */
                    mmx_int_mult(mm6,mm4,mm7)    /* mm4 = INT_MULT(mm6, mm4) */
865

866 867 868 869 870 871 872 873
                    /* high bytes */
                    mmx_high_bytes_to_words(mm3,mm5,mm0)
                    "\tpcmpeqb   %%mm1,%%mm1\n"
                    "\tpsubb     %%mm2,%%mm1\n" /* mm1 = 255 - A */
                    "\tpunpckhbw %%mm0,%%mm1\n" /* mm1 = (high bytes as word) mm1 */
                    "\tmovq         %3,%%mm6\n"  /* mm6 = words of value 2 */
                    "\tpmullw    %%mm5,%%mm6\n" /* mm6 = 2 * high bytes of B */
                    mmx_int_mult(mm6,mm1,mm7)    /* mm1 = INT_MULT(mm6, mm1) */
874

875
                    "\tpackuswb  %%mm1,%%mm4\n"  /* mm4 = intermediate value */
876

877 878 879 880
                    mmx_low_bytes_to_words(mm4,mm5,mm0)
                    mmx_low_bytes_to_words(mm2,mm6,mm0)
                    "\tpaddw     %%mm6,%%mm5\n"
                    mmx_int_mult(mm6,mm5,mm7)   /* mm5 = INT_MULT(mm6, mm5) low bytes */
881

882 883 884 885
                    mmx_high_bytes_to_words(mm4,mm1,mm0)
                    mmx_high_bytes_to_words(mm2,mm6,mm0)
                    "\tpaddw     %%mm6,%%mm1\n"
                    mmx_int_mult(mm6,mm1,mm7)   /* mm1 = INT_MULT(mm6, mm1) high bytes */
886

887
                    "\tpackuswb  %%mm1,%%mm5\n"
888

889 890 891
                    "\tmovq         %4,%%mm0\n"
                    "\tmovq      %%mm0,%%mm1\n"
                    "\tpandn     %%mm5,%%mm1\n"
892

893 894
                    "\t" pminub(mm2,mm3,mm4) "\n"
                    "\tpand      %%mm0,%%mm3\n"
895

896
                    "\tpor       %%mm3,%%mm1\n"
897

898 899 900 901 902 903 904
                    "\tmovq      %%mm1,%2\n"
                    : "+m" (*op.A), "+m" (*op.B), "+m" (*op.D)
                    : "m" (*rgba8_w2), "m" (*rgba8_alpha_mask)
                    : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
      op.A += 8;
      op.B += 8;
      op.D += 8;
Helvetix Victorinox's avatar
Helvetix Victorinox committed
905 906
  }

907
  if (op.n_pixels)
908 909 910 911 912 913 914 915 916 917 918 919 920 921 922 923 924 925 926 927 928 929 930 931 932 933 934 935 936 937 938 939 940 941 942 943 944 945 946 947 948 949 950 951 952 953 954 955 956 957
    {
      asm volatile ("  movd         %0,%%mm2\n"
                    "\tmovd         %1,%%mm3\n"

                    /* low bytes */
                    mmx_low_bytes_to_words(mm3,mm5,mm0)
                    "\tpcmpeqb   %%mm4,%%mm4\n"
                    "\tpsubb     %%mm2,%%mm4\n" /* mm4 = 255 - A */
                    "\tpunpcklbw %%mm0,%%mm4\n" /* mm4 = (low bytes as word) mm4 */
                    "\tmovq         %3,%%mm6\n"  /* mm6 = words of value 2 */
                    "\tpmullw    %%mm5,%%mm6\n" /* mm6 = 2 * low bytes of B */
                    mmx_int_mult(mm6,mm4,mm7)    /* mm4 = INT_MULT(mm6, mm4) */

                    /* high bytes */
                    mmx_high_bytes_to_words(mm3,mm5,mm0)
                    "\tpcmpeqb   %%mm1,%%mm1\n"
                    "\tpsubb     %%mm2,%%mm1\n" /* mm1 = 255 - A */
                    "\tpunpckhbw %%mm0,%%mm1\n" /* mm1 = (high bytes as word) mm1 */
                    "\tmovq         %3,%%mm6\n"  /* mm6 = words of value 2 */
                    "\tpmullw    %%mm5,%%mm6\n" /* mm6 = 2 * high bytes of B */
                    mmx_int_mult(mm6,mm1,mm7)    /* mm1 = INT_MULT(mm6, mm1) */

                    "\tpackuswb  %%mm1,%%mm4\n"  /* mm4 = intermediate value */

                    mmx_low_bytes_to_words(mm4,mm5,mm0)
                    mmx_low_bytes_to_words(mm2,mm6,mm0)
                    "\tpaddw     %%mm6,%%mm5\n"
                    mmx_int_mult(mm6,mm5,mm7)   /* mm5 = INT_MULT(mm6, mm5) low bytes */

                    mmx_high_bytes_to_words(mm4,mm1,mm0)
                    mmx_high_bytes_to_words(mm2,mm6,mm0)
                    "\tpaddw     %%mm6,%%mm1\n"
                    mmx_int_mult(mm6,mm1,mm7)   /* mm1 = INT_MULT(mm6, mm1) high bytes */

                    "\tpackuswb  %%mm1,%%mm5\n"

                    "\tmovq         %4,%%mm0\n"
                    "\tmovq      %%mm0,%%mm1\n"
                    "\tpandn     %%mm5,%%mm1\n"

                    "\t" pminub(mm2,mm3,mm4) "\n"
                    "\tpand      %%mm0,%%mm3\n"

                    "\tpor       %%mm3,%%mm1\n"

                    "\tmovd      %%mm1,%2\n"
                    : /* empty */
                    : "m" (*op.A), "m" (*op.B), "m" (*op.D), "m" (*rgba8_w2), "m" (*rgba8_alpha_mask)
                    : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4");
    }
Helvetix Victorinox's avatar
Helvetix Victorinox committed
958 959 960 961 962 963

  asm("emms");
}


void
964
gimp_composite_scale_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
Helvetix Victorinox's avatar
Helvetix Victorinox committed
965 966 967 968 969 970 971 972 973 974 975 976 977 978
{
  GimpCompositeContext op = *_op;

  asm volatile ("pxor    %%mm0,%%mm0\n"
                "\tmovl  %0,%%eax\n"
                "\tmovl  %%eax,%%ebx\n"
                "\tshl   $16,%%ebx\n"
                "\torl   %%ebx,%%eax\n"
                "\tmovd  %%eax,%%mm5\n"
                "\tmovd  %%eax,%%mm3\n"
                "\tpsllq $32,%%mm5\n"
                "\tpor   %%mm5,%%mm3\n"
                "\tmovq  %1,%%mm7\n"
                : /* empty */
979
                : "m" (op.scale.scale), "m" (*rgba8_w128)
Helvetix Victorinox's avatar
Helvetix Victorinox committed
980 981
                : "%eax", "%mm0", "%mm5", "%mm6", "%mm7");

982
  for (; op.n_pixels >= 2; op.n_pixels -= 2)
983
    {
Helvetix Victorinox's avatar
Helvetix Victorinox committed
984
      asm volatile ("movq           %1,%%mm2\n"
985 986 987 988 989 990 991 992 993 994 995 996 997
                    "\tmovq      %%mm2,%%mm1\n"
                    "\tpunpcklbw %%mm0,%%mm1\n"
                    "\tmovq      %%mm3,%%mm5\n"

                    "\t" pmulwX(mm5,mm1,mm7) "\n"

                    "\tmovq      %%mm2,%%mm4\n"
                    "\tpunpckhbw %%mm0,%%mm4\n"
                    "\tmovq      %%mm3,%%mm5\n"

                    "\t" pmulwX(mm5,mm4,mm7) "\n"

                    "\tpackuswb  %%mm4,%%mm1\n"
998

Helvetix Victorinox's avatar
Helvetix Victorinox committed
999 1000 1001 1002
                    "\tmovq    %%mm1,%0\n"
                    : "=m" (*op.D)
                    : "m" (*op.A)
                    : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
1003 1004 1005
      op.A += 8;
      op.D += 8;
    }
Helvetix Victorinox's avatar
Helvetix Victorinox committed
1006

1007
  if (op.n_pixels)
1008
    {
Helvetix Victorinox's avatar
Helvetix Victorinox committed
1009 1010 1011 1012
      asm volatile ("movd           %1,%%mm2\n"
                    "\tmovq      %%mm2,%%mm1\n"
                    "\tpunpcklbw %%mm0,%%mm1\n"
                    "\tmovq      %%mm3,%%mm5\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
1013

Helvetix Victorinox's avatar
Helvetix Victorinox committed
1014
                    "\t" pmulwX(mm5,mm1,mm7) "\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
1015

Helvetix Victorinox's avatar
Helvetix Victorinox committed
1016 1017 1018 1019 1020
                    "\tpackuswb  %%mm0,%%mm1\n"
                    "\tmovd      %%mm1,%0\n"
                    : "=m" (*op.D)
                    : "m" (*op.A)
                    : "%mm1", "%mm2", "%mm3", "%mm4", "%mm5", "%mm6", "%mm7");
Helvetix Victorinox's avatar
Helvetix Victorinox committed
1021 1022 1023 1024 1025 1026
  }

  asm("emms");
}

void
1027
gimp_composite_screen_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
Helvetix Victorinox's avatar
Helvetix Victorinox committed
1028 1029 1030
{
  GimpCompositeContext op = *_op;

1031 1032
  asm volatile ("movq    %0,%%mm0"     :  : "m" (*rgba8_alpha_mask) : "%mm0");
  asm volatile ("movq    %0,%%mm7"     :  : "m" (*rgba8_w128)  : "%mm7");
Helvetix Victorinox's avatar
Helvetix Victorinox committed
1033
  asm volatile ("pxor    %mm6, %mm6");
Helvetix Victorinox's avatar
Helvetix Victorinox committed
1034

1035
  for (; op.n_pixels >= 2; op.n_pixels -= 2)
1036 1037 1038 1039 1040 1041 1042 1043 1044 1045 1046 1047 1048 1049 1050 1051 1052 1053 1054 1055 1056 1057
    {
      asm volatile ("  movq         %0,%%mm2\n"
                    "\tmovq         %1,%%mm3\n"

                    "\tpcmpeqb   %%mm4,%%mm4\n"
                    "\tpsubb     %%mm2,%%mm4\n"
                    "\tpcmpeqb   %%mm5,%%mm5\n"
                    "\tpsubb     %%mm3,%%mm5\n"

                    "\tpunpcklbw %%mm6,%%mm4\n"
                    "\tpunpcklbw %%mm6,%%mm5\n"
                    "\tpmullw    %%mm4,%%mm5\n"
                    "\tpaddw     %%mm7,%%mm5\n"
                    "\tmovq      %%mm5,%%mm1\n"
                    "\tpsrlw       $ 8,%%mm1\n"
                    "\tpaddw     %%mm5,%%mm1\n"
                    "\tpsrlw       $ 8,%%mm1\n"

                    "\tpcmpeqb   %%mm4,%%mm4\n"
                    "\tpsubb     %%mm2,%%mm4\n"
                    "\tpcmpeqb   %%mm5,%%mm5\n"
                    "\tpsubb     %%mm3,%%mm5\n"
1058

1059 1060 1061 1062 1063 1064 1065 1066 1067 1068 1069 1070 1071 1072 1073 1074
                    "\tpunpckhbw %%mm6,%%mm4\n"
                    "\tpunpckhbw %%mm6,%%mm5\n"
                    "\tpmullw    %%mm4,%%mm5\n"
                    "\tpaddw     %%mm7,%%mm5\n"
                    "\tmovq      %%mm5,%%mm4\n"
                    "\tpsrlw       $ 8,%%mm4\n"
                    "\tpaddw     %%mm5,%%mm4\n"
                    "\tpsrlw       $ 8,%%mm4\n"

                    "\tpackuswb  %%mm4,%%mm1\n"

                    "\tpcmpeqb   %%mm4,%%mm4\n"
                    "\tpsubb     %%mm1,%%mm4\n"

                    "\tmovq      %%mm0,%%mm1\n"
                    "\tpandn     %%mm4,%%mm1\n"
1075

1076 1077 1078 1079
                    "\t" pminub(mm2,mm3,mm4) "\n"
                    "\tpand      %%mm0,%%mm3\n"

                    "\tpor       %%mm3,%%mm1\n"
1080

1081 1082 1083 1084 1085 1086 1087
                    "\tmovq      %%mm1,%2\n"
                    : /* empty */
                    : "m" (*op.A), "m" (*op.B), "m" (*op.D)
                    : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
      op.A += 8;
      op.B += 8;
      op.D += 8;
Helvetix Victorinox's avatar
Helvetix Victorinox committed
1088 1089
  }

1090
  if (op.n_pixels)
1091
    {
1092 1093 1094 1095 1096 1097 1098 1099 1100 1101 1102 1103 1104 1105 1106 1107 1108 1109 1110 1111 1112 1113 1114 1115 1116 1117 1118 1119 1120 1121 1122 1123 1124 1125 1126 1127 1128 1129
    asm volatile ("  movd         %0,%%mm2\n"
                  "\tmovd         %1,%%mm3\n"

                  "\tpcmpeqb   %%mm4,%%mm4\n"
                  "\tpsubb     %%mm2,%%mm4\n"
                  "\tpcmpeqb   %%mm5,%%mm5\n"
                  "\tpsubb     %%mm3,%%mm5\n"

                  "\tpunpcklbw %%mm6,%%mm4\n"
                  "\tpunpcklbw %%mm6,%%mm5\n"
                  "\tpmullw    %%mm4,%%mm5\n"
                  "\tpaddw     %%mm7,%%mm5\n"
                  "\tmovq      %%mm5,%%mm1\n"
                  "\tpsrlw       $ 8,%%mm1\n"
                  "\tpaddw     %%mm5,%%mm1\n"
                  "\tpsrlw       $ 8,%%mm1\n"

                  "\tpcmpeqb   %%mm4,%%mm4\n"
                  "\tpsubb     %%mm2,%%mm4\n"
                  "\tpcmpeqb   %%mm5,%%mm5\n"
                  "\tpsubb     %%mm3,%%mm5\n"

                  "\tpunpckhbw %%mm6,%%mm4\n"
                  "\tpunpckhbw %%mm6,%%mm5\n"
                  "\tpmullw    %%mm4,%%mm5\n"
                  "\tpaddw     %%mm7,%%mm5\n"
                  "\tmovq      %%mm5,%%mm4\n"
                  "\tpsrlw       $ 8,%%mm4\n"
                  "\tpaddw     %%mm5,%%mm4\n"
                  "\tpsrlw       $ 8,%%mm4\n"

                  "\tpackuswb  %%mm4,%%mm1\n"

                  "\tpcmpeqb   %%mm4,%%mm4\n"
                  "\tpsubb     %%mm1,%%mm4\n"

                  "\tmovq      %%mm0,%%mm1\n"
                  "\tpandn     %%mm4,%%mm1\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
1130 1131

                  "\t" pminub(mm2,mm3,mm4) "\n"
1132
                  "\tpand      %%mm0,%%mm3\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
1133

1134
                  "\tpor       %%mm3,%%mm1\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
1135

1136
                  "\tmovd      %%mm1,%2\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
1137
                  : /* empty */
1138
                  : "m" (*op.A), "m" (*op.B), "m" (*op.D)
Helvetix Victorinox's avatar
Helvetix Victorinox committed
1139 1140 1141 1142 1143 1144 1145 1146
                  : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
  }

  asm("emms");
}


void
1147
gimp_composite_subtract_rgba8_rgba8_rgba8_mmx (GimpCompositeContext *_op)
Helvetix Victorinox's avatar
Helvetix Victorinox committed
1148 1149 1150
{
  GimpCompositeContext op = *_op;

1151
  asm volatile ("movq    %0,%%mm0"     :  : "m" (*rgba8_alpha_mask) : "%mm0");
Helvetix Victorinox's avatar
Helvetix Victorinox committed
1152

1153
  for (; op.n_pixels >= 2; op.n_pixels -= 2)
1154
    {
Helvetix Victorinox's avatar
Helvetix Victorinox committed
1155 1156
      asm volatile ("  movq       %1,%%mm2\n"
                    "\tmovq       %2,%%mm3\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
1157

1158 1159
                    "\tmovq    %%mm2,%%mm4\n"
                    "\tpsubusb %%mm3,%%mm4\n"
1160

1161 1162
                    "\tmovq    %%mm0,%%mm1\n"
                    "\tpandn   %%mm4,%%mm1\n"
1163

1164
                    "\t" pminub(mm3,mm2,mm4) "\n"
1165

1166 1167
                    "\tpand    %%mm0,%%mm2\n"
                    "\tpor     %%mm2,%%mm1\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
1168 1169 1170
                    "\tmovq    %%mm1,%0\n"
                    : "=m" (*op.D)
                    : "m" (*op.A), "m" (*op.B)
1171 1172 1173 1174 1175
                    : "0", "1", "2", "%mm1", "%mm2", "%mm3", "%mm4", "%mm5");
      op.A += 8;
      op.B += 8;
      op.D += 8;
    }
Helvetix Victorinox's avatar
Helvetix Victorinox committed
1176

1177
  if (op.n_pixels)
1178
    {
1179 1180
    asm volatile ("  movd       %0,%%mm2\n"
                  "\tmovd       %1,%%mm3\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
1181

1182 1183
                  "\tmovq    %%mm2,%%mm4\n"
                  "\tpsubusb %%mm3,%%mm4\n"
1184

1185 1186
                  "\tmovq    %%mm0,%%mm1\n"
                  "\tpandn   %%mm4,%%mm1\n"
1187

Helvetix Victorinox's avatar
Helvetix Victorinox committed
1188 1189
                  "\t" pminub(mm3,mm2,mm4) "\n"

1190 1191 1192
                  "\tpand    %%mm0,%%mm2\n"
                  "\tpor     %%mm2,%%mm1\n"
                  "\tmovd    %%mm1,%2\n"
Helvetix Victorinox's avatar
Helvetix Victorinox committed
1193
                  : /* empty */
1194
                  : "m" (*op.A), "m" (*op.B), "m" (*op.D)