[Haiku-commits] r31165 - in haiku/trunk/src/servers/app: . drawing/Painter
stippi at BerliOS
stippi at mail.berlios.de
Mon Jun 22 00:07:56 CEST 2009
Author: stippi
Date: 2009-06-22 00:07:54 +0200 (Mon, 22 Jun 2009)
New Revision: 31165
ViewCVS: http://svn.berlios.de/viewcvs/haiku?rev=31165&view=rev
Added:
haiku/trunk/src/servers/app/drawing/Painter/painter_bilinear_scale.nasm
Modified:
haiku/trunk/src/servers/app/AppServer.cpp
haiku/trunk/src/servers/app/AppServer.h
haiku/trunk/src/servers/app/drawing/Painter/Jamfile
haiku/trunk/src/servers/app/drawing/Painter/Painter.cpp
haiku/trunk/src/servers/app/drawing/Painter/Painter.h
Log:
Patch by Christian Packmann:
* Implemented a CPU feature detection function in AppServer.cpp.
The results are put into the global variable gAppServerSIMDFlags.
* Implemented an SIMD accelerated version of the bilinear bitmap
scaling code that is the backend of BView::DrawBitmap(...,
uint32 options) used by the MediaPlayer to smoothly upscale
movies when no video overlay is available. The speedup is very
noticeable and a Core 2 Duo @ 1.8 GHz can play at 1920x1200 now
without breaking a sweat. There is currently one SIMD version
implemented which uses MMX and plain SSE.
Very cool! Thanks a lot!
Modified: haiku/trunk/src/servers/app/AppServer.cpp
===================================================================
--- haiku/trunk/src/servers/app/AppServer.cpp 2009-06-21 21:29:57 UTC (rev 31164)
+++ haiku/trunk/src/servers/app/AppServer.cpp 2009-06-21 22:07:54 UTC (rev 31165)
@@ -6,6 +6,7 @@
* DarkWyrm <bpmagic at columbus.rr.com>
* Axel Dörfler, axeld at pinc-software.de
* Stephan Aßmus <superstippi at gmx.de>
+ * Christian Packmann
*/
@@ -35,16 +36,77 @@
port_id gAppServerPort;
static AppServer *sAppServer;
BTokenSpace gTokenSpace;
+uint32 gAppServerSIMDFlags = 0;
+/*!	Detect the SIMD instruction sets usable by app_server.
+
+	Queries every CPU reported by get_system_info() and keeps only the
+	flags that all of them support; the result is stored in the global
+	gAppServerSIMDFlags. A CPU whose vendor string is not in the list
+	below, or whose CPUID data cannot be read, contributes no flags, which
+	clears the result for the whole system. If the system info cannot be
+	obtained, gAppServerSIMDFlags is left untouched.
+*/
+static void
+detect_simd()
+{
+	// Only scan CPUs for which we are certain the SIMD flags are properly
+	// defined.
+	static const char* const kVendorNames[] = {
+		"GenuineIntel",
+		"AuthenticAMD",
+		"CentaurHauls", // Via CPUs, MMX and SSE support
+		"RiseRiseRise", // should be MMX-only
+		"CyrixInstead", // MMX-only, but custom MMX extensions
+		"GenuineTMx86", // MMX and SSE
+		NULL
+	};
+
+	system_info sysInfo;
+	if (get_system_info(&sysInfo) != B_OK || sysInfo.cpu_count < 1)
+		return;
+
+	// We start out with all flags set and end up with only those flags
+	// supported across all CPUs found.
+	uint32 appServerSIMD = 0xffffffff;
+
+	for (int32 cpu = 0; cpu < sysInfo.cpu_count; cpu++) {
+		// A CPU contributes no flags unless every step below succeeds.
+		uint32 cpuSIMD = 0;
+
+		cpuid_info cpuInfo;
+		if (get_cpuid(&cpuInfo, 0, cpu) == B_OK) {
+			// Get the vendor string and terminate it manually
+			char vendor[13];
+			memcpy(vendor, cpuInfo.eax_0.vendor_id, 12);
+			vendor[12] = 0;
+
+			bool vendorFound = false;
+			for (uint32 i = 0; kVendorNames[i] != NULL; i++) {
+				if (strcmp(vendor, kVendorNames[i]) == 0) {
+					vendorFound = true;
+					break;
+				}
+			}
+
+			// Copy eax now, cpuInfo is overwritten by the next query.
+			uint32 maxStdFunc = cpuInfo.regs.eax;
+
+			// The feature bits have to come from the CPU being examined;
+			// asking CPU 0 every time would defeat the per-CPU minimum.
+			if (vendorFound && maxStdFunc >= 1
+				&& get_cpuid(&cpuInfo, 1, cpu) == B_OK) {
+				uint32 edx = cpuInfo.regs.edx;
+				if (edx & (1 << 23))
+					cpuSIMD |= APPSERVER_SIMD_MMX;
+				if (edx & (1 << 25))
+					cpuSIMD |= APPSERVER_SIMD_SSE;
+			}
+		}
+
+		appServerSIMD &= cpuSIMD;
+	}
+	gAppServerSIMDFlags = appServerSIMD;
+}
+
+
/*!
\brief Constructor
-
+
This loads the default fonts, allocates all the major global variables, spawns the main housekeeping
threads, loads user preferences for the UI and decorator, and allocates various locks.
*/
AppServer::AppServer()
- : MessageLooper("app_server"),
+ :
+ MessageLooper("app_server"),
fMessagePort(-1),
fDesktops(),
fDesktopLock("AppServerDesktopLock")
@@ -70,10 +132,12 @@
gScreenManager = new ScreenManager();
gScreenManager->Run();
-
+
// Create the bitmap allocator. Object declared in BitmapManager.cpp
gBitmapManager = new BitmapManager();
+ // Initialize SIMD flags
+ detect_simd();
#if 0
_LaunchCursorThread();
#endif
@@ -148,7 +212,7 @@
AppServer::_FindDesktop(uid_t userID)
{
BAutolock locker(fDesktopLock);
-
+
for (int32 i = 0; i < fDesktops.CountItems(); i++) {
Desktop* desktop = fDesktops.ItemAt(i);
@@ -164,7 +228,7 @@
\brief Message handling function for all messages sent to the app_server
\param code ID of the message sent
\param buffer Attachment buffer for the message.
-
+
*/
void
AppServer::_DispatchMessage(int32 code, BPrivate::LinkReceiver& msg)
Modified: haiku/trunk/src/servers/app/AppServer.h
===================================================================
--- haiku/trunk/src/servers/app/AppServer.h 2009-06-21 21:29:57 UTC (rev 31164)
+++ haiku/trunk/src/servers/app/AppServer.h 2009-06-21 22:07:54 UTC (rev 31165)
@@ -56,5 +56,10 @@
extern BitmapManager *gBitmapManager;
extern port_id gAppServerPort;
+extern uint32 gAppServerSIMDFlags; // bitmask of APPSERVER_SIMD_* flags, written by detect_simd() in AppServer.cpp
+// Defines for SIMD support (bits of gAppServerSIMDFlags). Early implementation, subject to change
+#define APPSERVER_SIMD_MMX (1 << 0) // set by detect_simd() from CPUID function 1, EDX bit 23
+#define APPSERVER_SIMD_SSE (1 << 1) // set by detect_simd() from CPUID function 1, EDX bit 25
+
#endif /* APP_SERVER_H */
Modified: haiku/trunk/src/servers/app/drawing/Painter/Jamfile
===================================================================
--- haiku/trunk/src/servers/app/drawing/Painter/Jamfile 2009-06-21 21:29:57 UTC (rev 31164)
+++ haiku/trunk/src/servers/app/drawing/Painter/Jamfile 2009-06-21 22:07:54 UTC (rev 31165)
@@ -21,4 +21,6 @@
PixelFormat.cpp
AGGTextRenderer.cpp
+
+ painter_bilinear_scale.nasm
;
Modified: haiku/trunk/src/servers/app/drawing/Painter/Painter.cpp
===================================================================
--- haiku/trunk/src/servers/app/drawing/Painter/Painter.cpp 2009-06-21 21:29:57 UTC (rev 31164)
+++ haiku/trunk/src/servers/app/drawing/Painter/Painter.cpp 2009-06-21 22:07:54 UTC (rev 31165)
@@ -1,12 +1,13 @@
/*
- * Copyright 2005-2007, Stephan Aßmus <superstippi at gmx.de>.
+ * Copyright 2009, Christian Packmann.
* Copyright 2008, Andrej Spielmann <andrej.spielmann at seh.ox.ac.uk>.
+ * Copyright 2005-2009, Stephan Aßmus <superstippi at gmx.de>.
* All rights reserved. Distributed under the terms of the MIT License.
- *
- * API to the Anti-Grain Geometry based "Painter" drawing backend. Manages
- * rendering pipe-lines for stroke, fills, bitmap and text rendering.
*/
+/*! API to the Anti-Grain Geometry based "Painter" drawing backend. Manages
+ rendering pipe-lines for stroke, fills, bitmap and text rendering.*/
+
#include <new>
#include <stdio.h>
#include <string.h>
@@ -54,6 +55,8 @@
#include "Painter.h"
+#include "AppServer.h"
+
using std::nothrow;
#undef TRACE
@@ -78,41 +81,43 @@
// constructor
Painter::Painter()
- : fBuffer(),
- fPixelFormat(fBuffer, &fPatternHandler),
- fBaseRenderer(fPixelFormat),
- fUnpackedScanline(),
- fPackedScanline(),
- fSubpixPackedScanline(),
- fSubpixUnpackedScanline(),
- fSubpixRasterizer(),
- fRasterizer(),
- fSubpixRenderer(fBaseRenderer),
- fRenderer(fBaseRenderer),
- fRendererBin(fBaseRenderer),
+ :
+ fBuffer(),
+ fPixelFormat(fBuffer, &fPatternHandler),
+ fBaseRenderer(fPixelFormat),
+ fUnpackedScanline(),
+ fPackedScanline(),
+ fSubpixPackedScanline(),
+ fSubpixUnpackedScanline(),
+ fSubpixRasterizer(),
+ fRasterizer(),
+ fSubpixRenderer(fBaseRenderer),
+ fRenderer(fBaseRenderer),
+ fRendererBin(fBaseRenderer),
- fPath(),
- fCurve(fPath),
+ fPath(),
+ fCurve(fPath),
- fSubpixelPrecise(false),
- fValidClipping(false),
- fDrawingText(false),
- fAttached(false),
+ fSubpixelPrecise(false),
+ fValidClipping(false),
+ fDrawingText(false),
+ fAttached(false),
- fPenSize(1.0),
- fClippingRegion(NULL),
- fDrawingMode(B_OP_COPY),
- fAlphaSrcMode(B_PIXEL_ALPHA),
- fAlphaFncMode(B_ALPHA_OVERLAY),
- fLineCapMode(B_BUTT_CAP),
- fLineJoinMode(B_MITER_JOIN),
- fMiterLimit(B_DEFAULT_MITER_LIMIT),
+ fPenSize(1.0),
+ fClippingRegion(NULL),
+ fDrawingMode(B_OP_COPY),
+ fAlphaSrcMode(B_PIXEL_ALPHA),
+ fAlphaFncMode(B_ALPHA_OVERLAY),
+ fLineCapMode(B_BUTT_CAP),
+ fLineJoinMode(B_MITER_JOIN),
+ fMiterLimit(B_DEFAULT_MITER_LIMIT),
- fPatternHandler(),
- fTextRenderer(fSubpixRenderer, fRenderer, fRendererBin, fUnpackedScanline,
+ fPatternHandler(),
+ fTextRenderer(fSubpixRenderer, fRenderer, fRendererBin, fUnpackedScanline,
fSubpixUnpackedScanline, fSubpixRasterizer)
{
- fPixelFormat.SetDrawingMode(fDrawingMode, fAlphaSrcMode, fAlphaFncMode, false);
+ fPixelFormat.SetDrawingMode(fDrawingMode, fAlphaSrcMode, fAlphaFncMode,
+ false);
#if ALIASED_DRAWING
fRasterizer.gamma(agg::gamma_threshold(0.5));
@@ -131,8 +136,9 @@
void
Painter::AttachToBuffer(RenderingBuffer* buffer)
{
- if (buffer && buffer->InitCheck() >= B_OK &&
- (buffer->ColorSpace() == B_RGBA32 || buffer->ColorSpace() == B_RGB32)) {
+ if (buffer && buffer->InitCheck() >= B_OK
+ && (buffer->ColorSpace() == B_RGBA32
+ || buffer->ColorSpace() == B_RGB32)) {
// TODO: implement drawing on B_RGB24, B_RGB15, B_RGB16,
// B_CMAP8 and B_GRAY8 :-[
// (if ever we want to support some devices where this gives
@@ -2207,9 +2213,25 @@
const uint32 dstBPR = fBuffer.stride();
const uint32 srcBPR = srcBuffer.stride();
- bool optimizeForLowFilterRatio = xScale == yScale
- && (xScale == 1.5 || xScale == 2.0 || xScale == 2.5 || xScale == 3.0);
+ // Figure out which version of the code we want to use...
+ enum {
+ kOptimizeForLowFilterRatio = 0,
+ kUsePlainCVersion,
+ kUseSIMDVersion
+ };
+ int codeSelect = kUsePlainCVersion;
+
+ uint32 neededSIMDFlags = (APPSERVER_SIMD_MMX | APPSERVER_SIMD_SSE);
+ if ((gAppServerSIMDFlags & neededSIMDFlags) == neededSIMDFlags)
+ codeSelect = kUseSIMDVersion;
+ else {
+ if (xScale == yScale && (xScale == 1.5 || xScale == 2.0
+ || xScale == 2.5 || xScale == 3.0)) {
+ codeSelect = kOptimizeForLowFilterRatio;
+ }
+ }
+
// iterate over clipping boxes
fBaseRenderer.first_clip_box();
do {
@@ -2236,161 +2258,248 @@
//printf("x: %ld - %ld\n", xIndexL, xIndexR);
//printf("y: %ld - %ld\n", y1, y2);
- if (optimizeForLowFilterRatio) {
- // In this mode, we anticipate to hit many destination pixels that
- // map directly to a source pixel, we have more branches in the
- // inner loop but save time because of the special cases. If there
- // are too few direct hit pixels, the branches only waste time.
- for (; y1 <= y2; y1++) {
- // cache the weight of the top and bottom row
- const uint16 wTop = yWeights[y1].weight;
- const uint16 wBottom = 255 - yWeights[y1].weight;
+ switch (codeSelect) {
+ case kOptimizeForLowFilterRatio:
+ {
+ // In this mode, we anticipate to hit many destination pixels
+ // that map directly to a source pixel, we have more branches
+ // in the inner loop but save time because of the special
+ // cases. If there are too few direct hit pixels, the branches
+ // only waste time.
+ for (; y1 <= y2; y1++) {
+ // cache the weight of the top and bottom row
+ const uint16 wTop = yWeights[y1].weight;
+ const uint16 wBottom = 255 - yWeights[y1].weight;
- // buffer offset into source (top row)
- register const uint8* src
- = srcBuffer.row_ptr(yWeights[y1].index);
- // buffer handle for destination to be incremented per pixel
- register uint8* d = dst;
+ // buffer offset into source (top row)
+ register const uint8* src
+ = srcBuffer.row_ptr(yWeights[y1].index);
+ // buffer handle for destination to be incremented per
+ // pixel
+ register uint8* d = dst;
- if (wTop == 255) {
- for (int32 x = xIndexL; x <= xIndexR; x++) {
- const uint8* s = src + xWeights[x].index;
- // This case is important to prevent out
- // of bounds access at bottom edge of the source
- // bitmap. If the scale is low and integer, it will
- // also help the speed.
- if (xWeights[x].weight == 255) {
- // As above, but to prevent out of bounds
- // on the right edge.
- *(uint32*)d = *(uint32*)s;
- } else {
- // Only the left and right pixels are interpolated,
- // since the top row has 100% weight.
- const uint16 wLeft = xWeights[x].weight;
- const uint16 wRight = 255 - wLeft;
- d[0] = (s[0] * wLeft + s[4] * wRight) >> 8;
- d[1] = (s[1] * wLeft + s[5] * wRight) >> 8;
- d[2] = (s[2] * wLeft + s[6] * wRight) >> 8;
+ if (wTop == 255) {
+ for (int32 x = xIndexL; x <= xIndexR; x++) {
+ const uint8* s = src + xWeights[x].index;
+ // This case is important to prevent out
+ // of bounds access at bottom edge of the source
+ // bitmap. If the scale is low and integer, it will
+ // also help the speed.
+ if (xWeights[x].weight == 255) {
+ // As above, but to prevent out of bounds
+ // on the right edge.
+ *(uint32*)d = *(uint32*)s;
+ } else {
+ // Only the left and right pixels are
+ // interpolated, since the top row has 100%
+ // weight.
+ const uint16 wLeft = xWeights[x].weight;
+ const uint16 wRight = 255 - wLeft;
+ d[0] = (s[0] * wLeft + s[4] * wRight) >> 8;
+ d[1] = (s[1] * wLeft + s[5] * wRight) >> 8;
+ d[2] = (s[2] * wLeft + s[6] * wRight) >> 8;
+ }
+ d += 4;
}
- d += 4;
- }
- } else {
- for (int32 x = xIndexL; x <= xIndexR; x++) {
- const uint8* s = src + xWeights[x].index;
- if (xWeights[x].weight == 255) {
- // Prevent out of bounds access on the right edge
- // or simply speed up.
- const uint8* sBottom = s + srcBPR;
- d[0] = (s[0] * wTop + sBottom[0] * wBottom) >> 8;
- d[1] = (s[1] * wTop + sBottom[1] * wBottom) >> 8;
- d[2] = (s[2] * wTop + sBottom[2] * wBottom) >> 8;
- } else {
- // calculate the weighted sum of all four
- // interpolated pixels
- const uint16 wLeft = xWeights[x].weight;
- const uint16 wRight = 255 - wLeft;
- // left and right of top row
- uint32 t0 = (s[0] * wLeft + s[4] * wRight) * wTop;
- uint32 t1 = (s[1] * wLeft + s[5] * wRight) * wTop;
- uint32 t2 = (s[2] * wLeft + s[6] * wRight) * wTop;
+ } else {
+ for (int32 x = xIndexL; x <= xIndexR; x++) {
+ const uint8* s = src + xWeights[x].index;
+ if (xWeights[x].weight == 255) {
+ // Prevent out of bounds access on the right
+ // edge or simply speed up.
+ const uint8* sBottom = s + srcBPR;
+ d[0] = (s[0] * wTop + sBottom[0] * wBottom)
+ >> 8;
+ d[1] = (s[1] * wTop + sBottom[1] * wBottom)
+ >> 8;
+ d[2] = (s[2] * wTop + sBottom[2] * wBottom)
+ >> 8;
+ } else {
+ // calculate the weighted sum of all four
+ // interpolated pixels
+ const uint16 wLeft = xWeights[x].weight;
+ const uint16 wRight = 255 - wLeft;
+ // left and right of top row
+ uint32 t0 = (s[0] * wLeft + s[4] * wRight)
+ * wTop;
+ uint32 t1 = (s[1] * wLeft + s[5] * wRight)
+ * wTop;
+ uint32 t2 = (s[2] * wLeft + s[6] * wRight)
+ * wTop;
- // left and right of bottom row
- s += srcBPR;
- t0 += (s[0] * wLeft + s[4] * wRight) * wBottom;
- t1 += (s[1] * wLeft + s[5] * wRight) * wBottom;
- t2 += (s[2] * wLeft + s[6] * wRight) * wBottom;
+ // left and right of bottom row
+ s += srcBPR;
+ t0 += (s[0] * wLeft + s[4] * wRight) * wBottom;
+ t1 += (s[1] * wLeft + s[5] * wRight) * wBottom;
+ t2 += (s[2] * wLeft + s[6] * wRight) * wBottom;
- d[0] = t0 >> 16;
- d[1] = t1 >> 16;
- d[2] = t2 >> 16;
+ d[0] = t0 >> 16;
+ d[1] = t1 >> 16;
+ d[2] = t2 >> 16;
+ }
+ d += 4;
}
- d += 4;
}
+ dst += dstBPR;
}
- dst += dstBPR;
+ break;
}
- } else {
- // In this mode we anticipate many pixels wich need filtering,
- // there are no special cases for direct hit pixels except for the
- // last column/row and the right/bottom corner pixel.
- // The last column/row handling does not need to be performed
- // for all clipping rects!
- int32 yMax = y2;
- if (yWeights[yMax].weight == 255)
- yMax--;
- int32 xIndexMax = xIndexR;
- if (xWeights[xIndexMax].weight == 255)
- xIndexMax--;
+ case kUsePlainCVersion:
+ {
+ // In this mode we anticipate many pixels wich need filtering,
+ // there are no special cases for direct hit pixels except for
+ // the last column/row and the right/bottom corner pixel.
- for (; y1 <= yMax; y1++) {
- // cache the weight of the top and bottom row
- const uint16 wTop = yWeights[y1].weight;
- const uint16 wBottom = 255 - yWeights[y1].weight;
+ // The last column/row handling does not need to be performed
+ // for all clipping rects!
+ int32 yMax = y2;
+ if (yWeights[yMax].weight == 255)
+ yMax--;
+ int32 xIndexMax = xIndexR;
+ if (xWeights[xIndexMax].weight == 255)
+ xIndexMax--;
- // buffer offset into source (top row)
+ for (; y1 <= yMax; y1++) {
+ // cache the weight of the top and bottom row
+ const uint16 wTop = yWeights[y1].weight;
+ const uint16 wBottom = 255 - yWeights[y1].weight;
+
+ // buffer offset into source (top row)
+ register const uint8* src
+ = srcBuffer.row_ptr(yWeights[y1].index);
+ // buffer handle for destination to be incremented per
+ // pixel
+ register uint8* d = dst;
+
+ for (int32 x = xIndexL; x <= xIndexMax; x++) {
+ const uint8* s = src + xWeights[x].index;
+ // calculate the weighted sum of all four
+ // interpolated pixels
+ const uint16 wLeft = xWeights[x].weight;
+ const uint16 wRight = 255 - wLeft;
+ // left and right of top row
+ uint32 t0 = (s[0] * wLeft + s[4] * wRight) * wTop;
+ uint32 t1 = (s[1] * wLeft + s[5] * wRight) * wTop;
+ uint32 t2 = (s[2] * wLeft + s[6] * wRight) * wTop;
+
+ // left and right of bottom row
+ s += srcBPR;
+ t0 += (s[0] * wLeft + s[4] * wRight) * wBottom;
+ t1 += (s[1] * wLeft + s[5] * wRight) * wBottom;
+ t2 += (s[2] * wLeft + s[6] * wRight) * wBottom;
+ d[0] = t0 >> 16;
+ d[1] = t1 >> 16;
+ d[2] = t2 >> 16;
+ d += 4;
+ }
+ // last column of pixels if necessary
+ if (xIndexMax < xIndexR) {
+ const uint8* s = src + xWeights[xIndexR].index;
+ const uint8* sBottom = s + srcBPR;
+ d[0] = (s[0] * wTop + sBottom[0] * wBottom) >> 8;
+ d[1] = (s[1] * wTop + sBottom[1] * wBottom) >> 8;
+ d[2] = (s[2] * wTop + sBottom[2] * wBottom) >> 8;
+ }
+
+ dst += dstBPR;
+ }
+
+ // last row of pixels if necessary
+ // buffer offset into source (bottom row)
register const uint8* src
- = srcBuffer.row_ptr(yWeights[y1].index);
+ = srcBuffer.row_ptr(yWeights[y2].index);
// buffer handle for destination to be incremented per pixel
register uint8* d = dst;
- for (int32 x = xIndexL; x <= xIndexMax; x++) {
- const uint8* s = src + xWeights[x].index;
- // calculate the weighted sum of all four
- // interpolated pixels
- const uint16 wLeft = xWeights[x].weight;
- const uint16 wRight = 255 - wLeft;
- // left and right of top row
- uint32 t0 = (s[0] * wLeft + s[4] * wRight) * wTop;
- uint32 t1 = (s[1] * wLeft + s[5] * wRight) * wTop;
- uint32 t2 = (s[2] * wLeft + s[6] * wRight) * wTop;
+ if (yMax < y2) {
+ for (int32 x = xIndexL; x <= xIndexMax; x++) {
+ const uint8* s = src + xWeights[x].index;
+ const uint16 wLeft = xWeights[x].weight;
+ const uint16 wRight = 255 - wLeft;
+ d[0] = (s[0] * wLeft + s[4] * wRight) >> 8;
+ d[1] = (s[1] * wLeft + s[5] * wRight) >> 8;
+ d[2] = (s[2] * wLeft + s[6] * wRight) >> 8;
+ d += 4;
+ }
+ }
- // left and right of bottom row
- s += srcBPR;
- t0 += (s[0] * wLeft + s[4] * wRight) * wBottom;
- t1 += (s[1] * wLeft + s[5] * wRight) * wBottom;
- t2 += (s[2] * wLeft + s[6] * wRight) * wBottom;
- d[0] = t0 >> 16;
- d[1] = t1 >> 16;
- d[2] = t2 >> 16;
- d += 4;
- }
- // last column of pixels if necessary
- if (xIndexMax < xIndexR) {
+ // pixel in bottom right corner if necessary
+ if (yMax < y2 && xIndexMax < xIndexR) {
const uint8* s = src + xWeights[xIndexR].index;
- const uint8* sBottom = s + srcBPR;
- d[0] = (s[0] * wTop + sBottom[0] * wBottom) >> 8;
- d[1] = (s[1] * wTop + sBottom[1] * wBottom) >> 8;
- d[2] = (s[2] * wTop + sBottom[2] * wBottom) >> 8;
+ *(uint32*)d = *(uint32*)s;
}
-
- dst += dstBPR;
+ break;
}
- // last row of pixels if necessary
- // buffer offset into source (bottom row)
- register const uint8* src = srcBuffer.row_ptr(yWeights[y2].index);
- // buffer handle for destination to be incremented per pixel
- register uint8* d = dst;
+ case kUseSIMDVersion:
+ {
+ // Basically the same as the "standard" mode, but we use SIMD
+ // routines for the processing of the single display lines.
- if (yMax < y2) {
- for (int32 x = xIndexL; x <= xIndexMax; x++) {
- const uint8* s = src + xWeights[x].index;
- const uint16 wLeft = xWeights[x].weight;
- const uint16 wRight = 255 - wLeft;
- d[0] = (s[0] * wLeft + s[4] * wRight) >> 8;
- d[1] = (s[1] * wLeft + s[5] * wRight) >> 8;
- d[2] = (s[2] * wLeft + s[6] * wRight) >> 8;
- d += 4;
+ // The last column/row handling does not need to be performed
+ // for all clipping rects!
+ int32 yMax = y2;
+ if (yWeights[yMax].weight == 255)
+ yMax--;
+ int32 xIndexMax = xIndexR;
+ if (xWeights[xIndexMax].weight == 255)
+ xIndexMax--;
+
+ for (; y1 <= yMax; y1++) {
+ // cache the weight of the top and bottom row
+ const uint16 wTop = yWeights[y1].weight;
+ const uint16 wBottom = 255 - yWeights[y1].weight;
+
+ // buffer offset into source (top row)
+ const uint8* src = srcBuffer.row_ptr(yWeights[y1].index);
+ // buffer handle for destination to be incremented per
+ // pixel
+ uint8* d = dst;
+ bilinear_scale_xloop_mmxsse(src, dst, xWeights, xIndexL,
+ xIndexMax, wTop, srcBPR);
+ // increase pointer by processed pixels
+ d += (xIndexMax - xIndexL + 1) * 4;
+
+ // last column of pixels if necessary
+ if (xIndexMax < xIndexR) {
+ const uint8* s = src + xWeights[xIndexR].index;
+ const uint8* sBottom = s + srcBPR;
+ d[0] = (s[0] * wTop + sBottom[0] * wBottom) >> 8;
+ d[1] = (s[1] * wTop + sBottom[1] * wBottom) >> 8;
+ d[2] = (s[2] * wTop + sBottom[2] * wBottom) >> 8;
+ }
+
+ dst += dstBPR;
}
- }
- // pixel in bottom right corner if necessary
- if (yMax < y2 && xIndexMax < xIndexR) {
- const uint8* s = src + xWeights[xIndexR].index;
- *(uint32*)d = *(uint32*)s;
+ // last row of pixels if necessary
+ // buffer offset into source (bottom row)
+ register const uint8* src
+ = srcBuffer.row_ptr(yWeights[y2].index);
+ // buffer handle for destination to be incremented per pixel
+ register uint8* d = dst;
+
+ if (yMax < y2) {
+ for (int32 x = xIndexL; x <= xIndexMax; x++) {
+ const uint8* s = src + xWeights[x].index;
+ const uint16 wLeft = xWeights[x].weight;
+ const uint16 wRight = 255 - wLeft;
+ d[0] = (s[0] * wLeft + s[4] * wRight) >> 8;
+ d[1] = (s[1] * wLeft + s[5] * wRight) >> 8;
+ d[2] = (s[2] * wLeft + s[6] * wRight) >> 8;
+ d += 4;
+ }
+ }
+
+ // pixel in bottom right corner if necessary
+ if (yMax < y2 && xIndexMax < xIndexR) {
+ const uint8* s = src + xWeights[xIndexR].index;
+ *(uint32*)d = *(uint32*)s;
+ }
+ break;
}
- }
+ } // switch(codeselect)
} while (fBaseRenderer.next_clip_box());
#ifdef FILTER_INFOS_ON_HEAP
Modified: haiku/trunk/src/servers/app/drawing/Painter/Painter.h
===================================================================
--- haiku/trunk/src/servers/app/drawing/Painter/Painter.h 2009-06-21 21:29:57 UTC (rev 31164)
+++ haiku/trunk/src/servers/app/drawing/Painter/Painter.h 2009-06-21 22:07:54 UTC (rev 31165)
@@ -24,6 +24,15 @@
#include <Font.h>
#include <Rect.h>
+
+// Prototypes for assembler routines (implemented in painter_bilinear_scale.nasm, arguments are read from the stack)
+extern "C" {
+ void bilinear_scale_xloop_mmxsse(const uint8* src, void* dst, void* xWeights, // src: top source row; dst: first output pixel; xWeights: 4-byte entries { uint16 index (byte offset into src), uint16 weight (left weight) }
+ uint32 xmin, uint32 xmax, uint32 wTop, uint32 srcBPR ); // xmin..xmax: inclusive entry range in xWeights; wTop: weight of the top row (bottom = 255 - wTop); srcBPR: byte distance from the top to the bottom source row
+}
+
+extern uint32 gAppServerSIMDFlags;
+
class BBitmap;
class BRegion;
class BGradient;
Added: haiku/trunk/src/servers/app/drawing/Painter/painter_bilinear_scale.nasm
===================================================================
--- haiku/trunk/src/servers/app/drawing/Painter/painter_bilinear_scale.nasm 2009-06-21 21:29:57 UTC (rev 31164)
+++ haiku/trunk/src/servers/app/drawing/Painter/painter_bilinear_scale.nasm 2009-06-21 22:07:54 UTC (rev 31165)
@@ -0,0 +1,217 @@
+;
+; Copyright 2009, Christian Packmann.
+; All rights reserved.
+; Distributed under the terms of the MIT License, see
+; http://www.opensource.org/licenses/mit-license.php
+
+; Assembly code for Painter::_DrawBitmapBilinearCopy32() in Painter.cpp
+; This code implements only the inner x-loop, all other processing
+; is done in the C code.
+
+
+; ****** GENERAL NOTES *****
+
+; The implemented algorithm looks like this:
+; (pixLT * leftWeight + pixRT * rightWeight) * topWeight
+; +
+; (pixLB * leftWeight + pixRB * rightWeight) * bottomWeight
+;
+; with LT = LeftTop, RT = RightTop, LB = LeftBottom, RB = RightBottom
+;
+; For more detailed information, see the C implementation in
+; Painter.cpp
+;
+; Implementation notes:
+; The calculations are performed with 16-bit arithmetic. All values
+; are held in vars/registers as 8-bit values high-shifted by 8 bits;
+; i.e. 255<<8. This works because PMULHUW is used for MULs, and this
+; algorithm limits the variable values appropriately during all steps.
+; This will not work for all algorithms, so take note of that if you
+; want to recycle some of the code.
+
+; Notes on the code itself:
+; I've tried to keep the code small. That's why I'm using memory accesses
+; via index registers as much as possible. This costs execution time due
+; to the generated µops, but should minimize decode bandwidth pressure
+; due to the many MMX instructions.
+; Temporary variables are always stored to the stack instead of global
+; data space for this reason. So far I haven't exceeded 8-byte offsets,
+; so the instructions only need to encode a BYTE-offset instead of a DWORD.
+
+; Notes on code formatting/comments:
+; - integer and vector instructions are indented differently. I find this
+; helpful when parsing code, especially when I haven't looked at it for a
+; longer time.
+; - I've tried to comment the code so that it will be understandable and
+; maintainable in the future, and also by other persons than myself.
+; The current comments aren't yet fully standardized, I'm still working
+; on a coherent system for indicating the variables held within a register
+; which will help in understanding the data flow. Any suggestions
+; regarding this are welcome.
+; - Abbreviations for datatypes:
+; B = Byte 8 bit
+; W = Word 16 bit
+; DW = Doubleword 32 bit
+; QW = Quadword 64 bit
+; DQ = Doublequad 128 bit
+; A "p" in front of one of the datatypes signifies that the
+; variable/register is encoded in packed form; i.e. pW means
+; "packed Words"; four Words for a MMX register, 8 for a SSE register.
+; This should help in understanding the logical meaning of the data
+; transformations.
+; For better readability, the datatype indicator for a register is
+; bracketed with '#'; an MMX register holding two uint32 of value 255 would be
+; #pDW# 255 255
+
+
+
+; ****** Global exports *****
+
+; Do NOT use '_' in front of your defines, this is done
+; with YASMs --prefix option at assembly time.
+GLOBAL bilinear_scale_xloop_mmxsse
+
+
+; ********************
+; ****** DATA ******
+; ********************
+SECTION .data
+
+DATA_SECTION:
+ALIGN 16
+DATA_SSSE3:
+; data which is identical for MMX and SSE code is shared by declaring
+; it as DQ but providing two labels. MMX code just accesses the
+; first half.
+c4x16UW_129_LShift8: TIMES 4 dw 129<<8
+c4x16UW_255_LShift8: TIMES 4 dw 255<<8
+c2x32UD_ff000000: TIMES 4 DD 0xff000000
+
+; Argument definitions
+
+; Parameter offsets assume "push ebp"
+PAR_srcPtr EQU 8
+PAR_dstPtr EQU 12
+PAR_xWeightPtr EQU 16
+PAR_xmin EQU 20
+PAR_xmax EQU 24
+PAR_wTop EQU 28
+PAR_srcBPR EQU 32
+
+; Stack storage definitions (byte offsets from esp; the prologue of bilinear_scale_xloop_mmxsse reserves 4 + 32 bytes)
+ST_Q_wTop EQU 0 ; 4 x 16 bit: wTop << 8
+ST_Q_wBottom EQU 8 ; 4 x 16 bit: (255 << 8) - (wTop << 8)
+ST_Q_c4x16UW_129_LShift8 EQU 16 ; not referenced in this file (the constant is kept in mm6)
+ST_Q_c4x16UW_255_LShift8 EQU 24 ; stack copy of c4x16UW_255_LShift8
+ST_Q_lftWeight_A EQU 32 ; not referenced in this file; NOTE(review): offsets >= 32 reach past the 32 byte scratch area into the pad/saved registers
+ST_Q_rgtWeight_A EQU 40 ; not referenced in this file
+ST_Q_lftWeight_B EQU 48 ; not referenced in this file
+ST_Q_rgtWeight_B EQU 56 ; not referenced in this file
+
+
+; ********************
+; ****** CODE ******
+; ********************
+SECTION .code
+
+
+; void bilinear_scale_xloop_mmxsse(const uint8* src, void* dst, void* xWeights,
+;	uint32 xmin, uint32 xmax, uint32 wTop, uint32 srcBPR )
+;
+; Writes one 32-bit pixel to dst for every x in [xmin, xmax] (inclusive).
+; For each x the 4-byte entry xWeights[x] supplies a 16-bit byte offset
+; into the source row (at +0) and a 16-bit left weight (at +2); the right
+; weight is 255 - left weight, the bottom weight is 255 - wTop. The four
+; source pixels at src + offset, src + offset + 4 and the same two
+; addresses plus srcBPR are blended, the result is ORed with 0xff000000
+; and stored; dst advances by 4 bytes per pixel.
+; xmin and xmax are compared as signed values, matching the int32 loop
+; indices of the C caller; nothing is read or written when xmin > xmax.
+; ebx, esi and edi are preserved, EMMS is executed before returning.
+; Loop stats:
+;	34 instructions (6 moves, 5 integer, 23 vector)
+;	12 memory accesses
+ALIGN 16
+bilinear_scale_xloop_mmxsse:
+	push	ebp
+	mov		ebp, esp
+	and		esp, 0xfffffff8		; align stack to 8-byte boundary
+	push	ebx
+	push	edi
+	push	esi
+	sub		esp, 4 + 32			; +4 aligns to 8-byte boundary again; add 4 x QW
+; xmin > xmax?
+; Signed compare, like the "jle" at the end of the loop: an unsigned
+; compare would let an empty range such as xmin = 0, xmax = -1 through
+; and process one pixel that the caller never asked for.
+	mov		eax, [ebp + PAR_xmin]
+	cmp		eax, [ebp + PAR_xmax]
+	jg		.exit
+; preparations
+	; prepare wTop
+	mov		eax, [ebp + PAR_wTop]		; #pB#: 0 0 0 top
+	shl		eax, 8						; #pB#: 0 0 top 0
+		movd	mm0, eax				; #pW# 0 0 0 top
+		pshufw	mm0, mm0, 00000000b		; #pW# top top top top
+		movq	[esp + ST_Q_wTop], mm0
+	; move constants
+		movq	mm5, [c4x16UW_255_LShift8]
+		movq	[esp + ST_Q_c4x16UW_255_LShift8], mm5
+	; prepare wBottom
+		movq	mm1, mm5				; #pW# 255 255 255 255
+		psubw	mm1, mm0				; 255 - wTop = wBottom
+		movq	[esp + ST_Q_wBottom], mm1
+
+; load params; leave ebx, ecx as scratch
+	mov		eax, [ebp + PAR_xmin]		; loop counter
+	mov		edx, [ebp + PAR_xWeightPtr]	; xWeights array
+	mov		esi, [ebp + PAR_srcPtr]		; source bitmap
+	mov		edi, [ebp + PAR_dstPtr]		; destination bitmap
+		movq	mm6, [c4x16UW_129_LShift8]
+		movq	mm7, [c2x32UD_ff000000]
+
+; main loop
+ALIGN 16
+.loop:
+	; load Left/Right weights into mm0/mm1
+	movzx	ebx, WORD [edx + eax*4 + 2]	; xWeights + x*4 + 2-> FilterInfo[x].weight
+	shl		ebx, 8						; #pB# 0 0 leftW 0
+		pxor	mm2, mm2				; clear before use
+		movd	mm0, ebx				; #pW# 0 0 0 leftW
+		movq	mm1, [esp + ST_Q_c4x16UW_255_LShift8]
+		pshufw	mm0, mm0, 00000000b		; #pW# lW lW lW lW
+		psubw	mm1, mm0				; #pW# rW rW rW rW
+	movzx	ecx, WORD [edx + eax*4]		; xWeights + x*4 -> FilterInfo[x].index
+		pxor	mm3, mm3				; clear before use
+	mov		ebx, ecx
+	; process top and bottom pixels, interleave instructions to avoid latencies
+		pxor	mm4, mm4				; clear before use
+		; unpack pixel to high byte
+		punpcklbw	mm2, [esi + ecx]		; pixLeftTop
+		; unpack pixel to high byte
+		punpcklbw	mm3, [esi + ecx + 4]	; pixRightTop
+
+	add		ebx, [ebp + PAR_srcBPR]		; address:bottom pixels
+		pmulhuw	mm2, mm0				; pixLT * leftWeight
+		pmulhuw	mm3, mm1				; pixRT * rightWeight
+		; calc address for bottom pix
+		pxor	mm5, mm5				; clear before use
+		punpcklbw	mm4, [esi + ebx]		; pixLeftBottom
+		punpcklbw	mm5, [esi + ebx + 4]	; pixRightBottom
+		pmulhuw	mm4, mm0				; pixLB * leftWeight
+		pmulhuw	mm5, mm1				; pixRB * rightWeight
+
+		paddw	mm2, mm3				; pixLT + pixRT
+		paddw	mm4, mm5				; pixLB + pixRB
+		pmulhuw	mm2, [esp + ST_Q_wTop]		; * weightTop
+		pmulhuw	mm4, [esp + ST_Q_wBottom]	; * weightBottom
+
+		; add both temp results
+		paddw	mm2, mm4
+		; divide by 65025 using integer reciprocal: (*129 >> 7)
+		pmulhuw	mm2, mm6
+		psrlw	mm2, 7
+		; pack & store
+		packuswb	mm2, mm2
+		por		mm2, mm7				; | 0xff000000
+		movd	[edi], mm2				; store pixel as DWord
+	add		edi, 4
+; loopctr <= xmax?
+	inc		eax
+	cmp		eax, [ebp + PAR_xmax]
+	jle		.loop
+.exit:
+		emms							; Don't EVER forget to call EMMS!
+	add		esp, 4 + 32					; restore stack pointer
+	pop		esi
+	pop		edi
+	pop		ebx
+	mov		esp, ebp
+	pop		ebp
+	ret
More information about the Haiku-commits
mailing list