[Haiku-commits] r31165 - in haiku/trunk/src/servers/app: . drawing/Painter

stippi at BerliOS stippi at mail.berlios.de
Mon Jun 22 00:07:56 CEST 2009


Author: stippi
Date: 2009-06-22 00:07:54 +0200 (Mon, 22 Jun 2009)
New Revision: 31165
ViewCVS: http://svn.berlios.de/viewcvs/haiku?rev=31165&view=rev

Added:
   haiku/trunk/src/servers/app/drawing/Painter/painter_bilinear_scale.nasm
Modified:
   haiku/trunk/src/servers/app/AppServer.cpp
   haiku/trunk/src/servers/app/AppServer.h
   haiku/trunk/src/servers/app/drawing/Painter/Jamfile
   haiku/trunk/src/servers/app/drawing/Painter/Painter.cpp
   haiku/trunk/src/servers/app/drawing/Painter/Painter.h
Log:
Patch by Christian Packmann:
* Implemented a CPU feature detection function in AppServer.cpp.
  The results are put into the global variable gAppServerSIMDFlags.
* Implemented an SIMD accelerated version of the bilinear bitmap
  scaling code that is the backend of BView::DrawBitmap(...,
  uint32 options) used by the MediaPlayer to smoothly upscale
  movies when no video overlay is available. The speed up is very
  noticeable and a Core 2 Duo @ 1.8 GHz can play at 1920x1200 now
  without breaking a sweat. There is currently one SIMD version
  implemented which uses MMX and plain SSE.

Very cool! Thanks a lot!


Modified: haiku/trunk/src/servers/app/AppServer.cpp
===================================================================
--- haiku/trunk/src/servers/app/AppServer.cpp	2009-06-21 21:29:57 UTC (rev 31164)
+++ haiku/trunk/src/servers/app/AppServer.cpp	2009-06-21 22:07:54 UTC (rev 31165)
@@ -6,6 +6,7 @@
  *		DarkWyrm <bpmagic at columbus.rr.com>
  *		Axel Dörfler, axeld at pinc-software.de
  *		Stephan Aßmus <superstippi at gmx.de>
+ * 		Christian Packmann
  */
 
 
@@ -35,16 +36,77 @@
 port_id gAppServerPort;
 static AppServer *sAppServer;
 BTokenSpace gTokenSpace;
+uint32 gAppServerSIMDFlags = 0;
 
 
+/*! Detects the SIMD instruction sets usable by the app_server and stores
+	them in gAppServerSIMDFlags. Only flags reported by every CPU in the
+	system are kept; unknown vendors or failed queries yield no flags. */
+static void
+detect_simd()
+{
+	// Only scan CPUs for which we are certain the SIMD flags are properly
+	// defined.
+	static const char* const kVendorNames[] = {
+		"GenuineIntel",
+		"AuthenticAMD",
+		"CentaurHauls", // Via CPUs, MMX and SSE support
+		"RiseRiseRise", // should be MMX-only
+		"CyrixInstead", // MMX-only, but custom MMX extensions
+		"GenuineTMx86", // MMX and SSE
+		NULL
+	};
+
+	system_info sysInfo;
+	if (get_system_info(&sysInfo) != B_OK || sysInfo.cpu_count < 1)
+		return;
+
+	// We start out with all flags set and end up with only those flags
+	// supported across all CPUs found.
+	uint32 appServerSIMD = 0xffffffff;
+
+	for (int32 cpu = 0; cpu < sysInfo.cpu_count; cpu++) {
+		// no flags unless this CPU can be queried and identified
+		uint32 cpuSIMD = 0;
+		cpuid_info cpuInfo;
+		if (get_cpuid(&cpuInfo, 0, cpu) == B_OK) {
+			// Get the vendor string and terminate it manually
+			char vendor[13];
+			memcpy(vendor, cpuInfo.eax_0.vendor_id, 12);
+			vendor[12] = 0;
+
+			bool vendorFound = false;
+			for (uint32 i = 0; !vendorFound && kVendorNames[i] != NULL; i++)
+				vendorFound = strcmp(vendor, kVendorNames[i]) == 0;
+
+			// Function 1 (feature flags) must exist and has to be queried
+			// on the CPU being examined, not always on CPU 0.
+			uint32 maxStdFunc = cpuInfo.regs.eax;
+			if (vendorFound && maxStdFunc >= 1
+				&& get_cpuid(&cpuInfo, 1, cpu) == B_OK) {
+				uint32 edx = cpuInfo.regs.edx;
+				if (edx & (1 << 23))
+					cpuSIMD |= APPSERVER_SIMD_MMX;
+				if (edx & (1 << 25))
+					cpuSIMD |= APPSERVER_SIMD_SSE;
+			}
+		}
+		appServerSIMD &= cpuSIMD;
+	}
+	gAppServerSIMDFlags = appServerSIMD;
+}
+
+
 /*!
 	\brief Constructor
-	
+
 	This loads the default fonts, allocates all the major global variables, spawns the main housekeeping
 	threads, loads user preferences for the UI and decorator, and allocates various locks.
 */
 AppServer::AppServer()
-	: MessageLooper("app_server"),
+	:
+	MessageLooper("app_server"),
 	fMessagePort(-1),
 	fDesktops(),
 	fDesktopLock("AppServerDesktopLock")
@@ -70,10 +132,12 @@
 
 	gScreenManager = new ScreenManager();
 	gScreenManager->Run();
-	
+
 	// Create the bitmap allocator. Object declared in BitmapManager.cpp
 	gBitmapManager = new BitmapManager();
 
+	// Initialize SIMD flags
+	detect_simd();
 #if 0
 	_LaunchCursorThread();
 #endif
@@ -148,7 +212,7 @@
 AppServer::_FindDesktop(uid_t userID)
 {
 	BAutolock locker(fDesktopLock);
-	
+
 	for (int32 i = 0; i < fDesktops.CountItems(); i++) {
 		Desktop* desktop = fDesktops.ItemAt(i);
 
@@ -164,7 +228,7 @@
 	\brief Message handling function for all messages sent to the app_server
 	\param code ID of the message sent
 	\param buffer Attachment buffer for the message.
-	
+
 */
 void
 AppServer::_DispatchMessage(int32 code, BPrivate::LinkReceiver& msg)

Modified: haiku/trunk/src/servers/app/AppServer.h
===================================================================
--- haiku/trunk/src/servers/app/AppServer.h	2009-06-21 21:29:57 UTC (rev 31164)
+++ haiku/trunk/src/servers/app/AppServer.h	2009-06-21 22:07:54 UTC (rev 31165)
@@ -56,5 +56,10 @@
 
 extern BitmapManager *gBitmapManager;
 extern port_id gAppServerPort;
+extern uint32 gAppServerSIMDFlags;
 
+// Defines for SIMD support. Early implementation, subject to change
+#define APPSERVER_SIMD_MMX	(1 << 0)
+#define APPSERVER_SIMD_SSE	(1 << 1)
+
 #endif	/* APP_SERVER_H */

Modified: haiku/trunk/src/servers/app/drawing/Painter/Jamfile
===================================================================
--- haiku/trunk/src/servers/app/drawing/Painter/Jamfile	2009-06-21 21:29:57 UTC (rev 31164)
+++ haiku/trunk/src/servers/app/drawing/Painter/Jamfile	2009-06-21 22:07:54 UTC (rev 31165)
@@ -21,4 +21,6 @@
 	PixelFormat.cpp
 
 	AGGTextRenderer.cpp
+
+	painter_bilinear_scale.nasm
 ;

Modified: haiku/trunk/src/servers/app/drawing/Painter/Painter.cpp
===================================================================
--- haiku/trunk/src/servers/app/drawing/Painter/Painter.cpp	2009-06-21 21:29:57 UTC (rev 31164)
+++ haiku/trunk/src/servers/app/drawing/Painter/Painter.cpp	2009-06-21 22:07:54 UTC (rev 31165)
@@ -1,12 +1,13 @@
 /*
- * Copyright 2005-2007, Stephan Aßmus <superstippi at gmx.de>.
+ * Copyright 2009, Christian Packmann.
  * Copyright 2008, Andrej Spielmann <andrej.spielmann at seh.ox.ac.uk>.
+ * Copyright 2005-2009, Stephan Aßmus <superstippi at gmx.de>.
  * All rights reserved. Distributed under the terms of the MIT License.
- *
- * API to the Anti-Grain Geometry based "Painter" drawing backend. Manages
- * rendering pipe-lines for stroke, fills, bitmap and text rendering.
  */
 
+/*! API to the Anti-Grain Geometry based "Painter" drawing backend. Manages
+	rendering pipe-lines for stroke, fills, bitmap and text rendering.*/
+
 #include <new>
 #include <stdio.h>
 #include <string.h>
@@ -54,6 +55,8 @@
 
 #include "Painter.h"
 
+#include "AppServer.h"
+
 using std::nothrow;
 
 #undef TRACE
@@ -78,41 +81,43 @@
 
 // constructor
 Painter::Painter()
-	: fBuffer(),
-	  fPixelFormat(fBuffer, &fPatternHandler),
-	  fBaseRenderer(fPixelFormat),
-	  fUnpackedScanline(),
-	  fPackedScanline(),
-	  fSubpixPackedScanline(),
-	  fSubpixUnpackedScanline(),
-	  fSubpixRasterizer(),
-	  fRasterizer(),
-	  fSubpixRenderer(fBaseRenderer),
-	  fRenderer(fBaseRenderer),
-	  fRendererBin(fBaseRenderer),
+	:
+	fBuffer(),
+	fPixelFormat(fBuffer, &fPatternHandler),
+	fBaseRenderer(fPixelFormat),
+	fUnpackedScanline(),
+	fPackedScanline(),
+	fSubpixPackedScanline(),
+	fSubpixUnpackedScanline(),
+	fSubpixRasterizer(),
+	fRasterizer(),
+	fSubpixRenderer(fBaseRenderer),
+	fRenderer(fBaseRenderer),
+	fRendererBin(fBaseRenderer),
 
-	  fPath(),
-	  fCurve(fPath),
+	fPath(),
+	fCurve(fPath),
 
-	  fSubpixelPrecise(false),
-	  fValidClipping(false),
-	  fDrawingText(false),
-	  fAttached(false),
+	fSubpixelPrecise(false),
+	fValidClipping(false),
+	fDrawingText(false),
+	fAttached(false),
 
-	  fPenSize(1.0),
-	  fClippingRegion(NULL),
-	  fDrawingMode(B_OP_COPY),
-	  fAlphaSrcMode(B_PIXEL_ALPHA),
-	  fAlphaFncMode(B_ALPHA_OVERLAY),
-	  fLineCapMode(B_BUTT_CAP),
-	  fLineJoinMode(B_MITER_JOIN),
-	  fMiterLimit(B_DEFAULT_MITER_LIMIT),
+	fPenSize(1.0),
+	fClippingRegion(NULL),
+	fDrawingMode(B_OP_COPY),
+	fAlphaSrcMode(B_PIXEL_ALPHA),
+	fAlphaFncMode(B_ALPHA_OVERLAY),
+	fLineCapMode(B_BUTT_CAP),
+	fLineJoinMode(B_MITER_JOIN),
+	fMiterLimit(B_DEFAULT_MITER_LIMIT),
 
-	  fPatternHandler(),
-	  fTextRenderer(fSubpixRenderer, fRenderer, fRendererBin, fUnpackedScanline,
+	fPatternHandler(),
+	fTextRenderer(fSubpixRenderer, fRenderer, fRendererBin, fUnpackedScanline,
 		fSubpixUnpackedScanline, fSubpixRasterizer)
 {
-	fPixelFormat.SetDrawingMode(fDrawingMode, fAlphaSrcMode, fAlphaFncMode, false);
+	fPixelFormat.SetDrawingMode(fDrawingMode, fAlphaSrcMode, fAlphaFncMode,
+		false);
 
 #if ALIASED_DRAWING
 	fRasterizer.gamma(agg::gamma_threshold(0.5));
@@ -131,8 +136,9 @@
 void
 Painter::AttachToBuffer(RenderingBuffer* buffer)
 {
-	if (buffer && buffer->InitCheck() >= B_OK &&
-		(buffer->ColorSpace() == B_RGBA32 || buffer->ColorSpace() == B_RGB32)) {
+	if (buffer && buffer->InitCheck() >= B_OK
+		&& (buffer->ColorSpace() == B_RGBA32
+			|| buffer->ColorSpace() == B_RGB32)) {
 		// TODO: implement drawing on B_RGB24, B_RGB15, B_RGB16,
 		// B_CMAP8 and B_GRAY8 :-[
 		// (if ever we want to support some devices where this gives
@@ -2207,9 +2213,25 @@
 	const uint32 dstBPR = fBuffer.stride();
 	const uint32 srcBPR = srcBuffer.stride();
 
-	bool optimizeForLowFilterRatio = xScale == yScale
-		&& (xScale == 1.5 || xScale == 2.0 || xScale == 2.5 || xScale == 3.0);
+	// Figure out which version of the code we want to use...
+	enum {
+		kOptimizeForLowFilterRatio = 0,
+		kUsePlainCVersion,
+		kUseSIMDVersion
+	};
 
+	int codeSelect = kUsePlainCVersion;
+
+	uint32 neededSIMDFlags = (APPSERVER_SIMD_MMX | APPSERVER_SIMD_SSE);
+	if ((gAppServerSIMDFlags & neededSIMDFlags) == neededSIMDFlags)
+		codeSelect = kUseSIMDVersion;
+	else {
+		if (xScale == yScale && (xScale == 1.5 || xScale == 2.0
+			|| xScale == 2.5 || xScale == 3.0)) {
+			codeSelect = kOptimizeForLowFilterRatio;
+		}
+	}
+
 	// iterate over clipping boxes
 	fBaseRenderer.first_clip_box();
 	do {
@@ -2236,161 +2258,248 @@
 //printf("x: %ld - %ld\n", xIndexL, xIndexR);
 //printf("y: %ld - %ld\n", y1, y2);
 
-		if (optimizeForLowFilterRatio) {
-			// In this mode, we anticipate to hit many destination pixels that
-			// map directly to a source pixel, we have more branches in the
-			// inner loop but save time because of the special cases. If there
-			// are too few direct hit pixels, the branches only waste time.
-			for (; y1 <= y2; y1++) {
-				// cache the weight of the top and bottom row
-				const uint16 wTop = yWeights[y1].weight;
-				const uint16 wBottom = 255 - yWeights[y1].weight;
+		switch (codeSelect) {
+			case kOptimizeForLowFilterRatio:
+			{
+				// In this mode, we anticipate to hit many destination pixels
+				// that map directly to a source pixel, we have more branches
+				// in the inner loop but save time because of the special
+				// cases. If there are too few direct hit pixels, the branches
+				// only waste time.
+				for (; y1 <= y2; y1++) {
+					// cache the weight of the top and bottom row
+					const uint16 wTop = yWeights[y1].weight;
+					const uint16 wBottom = 255 - yWeights[y1].weight;
 
-				// buffer offset into source (top row)
-				register const uint8* src
-					= srcBuffer.row_ptr(yWeights[y1].index);
-				// buffer handle for destination to be incremented per pixel
-				register uint8* d = dst;
+					// buffer offset into source (top row)
+					register const uint8* src
+						= srcBuffer.row_ptr(yWeights[y1].index);
+					// buffer handle for destination to be incremented per
+					// pixel
+					register uint8* d = dst;
 
-				if (wTop == 255) {
-					for (int32 x = xIndexL; x <= xIndexR; x++) {
-						const uint8* s = src + xWeights[x].index;
-						// This case is important to prevent out
-						// of bounds access at bottom edge of the source
-						// bitmap. If the scale is low and integer, it will
-						// also help the speed.
-						if (xWeights[x].weight == 255) {
-							// As above, but to prevent out of bounds
-							// on the right edge.
-							*(uint32*)d = *(uint32*)s;
-						} else {
-							// Only the left and right pixels are interpolated,
-							// since the top row has 100% weight.
-							const uint16 wLeft = xWeights[x].weight;
-							const uint16 wRight = 255 - wLeft;
-							d[0] = (s[0] * wLeft + s[4] * wRight) >> 8;
-							d[1] = (s[1] * wLeft + s[5] * wRight) >> 8;
-							d[2] = (s[2] * wLeft + s[6] * wRight) >> 8;
+					if (wTop == 255) {
+						for (int32 x = xIndexL; x <= xIndexR; x++) {
+							const uint8* s = src + xWeights[x].index;
+							// This case is important to prevent out
+							// of bounds access at bottom edge of the source
+							// bitmap. If the scale is low and integer, it will
+							// also help the speed.
+							if (xWeights[x].weight == 255) {
+								// As above, but to prevent out of bounds
+								// on the right edge.
+								*(uint32*)d = *(uint32*)s;
+							} else {
+								// Only the left and right pixels are
+								// interpolated, since the top row has 100%
+								// weight.
+								const uint16 wLeft = xWeights[x].weight;
+								const uint16 wRight = 255 - wLeft;
+								d[0] = (s[0] * wLeft + s[4] * wRight) >> 8;
+								d[1] = (s[1] * wLeft + s[5] * wRight) >> 8;
+								d[2] = (s[2] * wLeft + s[6] * wRight) >> 8;
+							}
+							d += 4;
 						}
-						d += 4;
-					}
-				} else {
-					for (int32 x = xIndexL; x <= xIndexR; x++) {
-						const uint8* s = src + xWeights[x].index;
-						if (xWeights[x].weight == 255) {
-							// Prevent out of bounds access on the right edge
-							// or simply speed up.
-							const uint8* sBottom = s + srcBPR;
-							d[0] = (s[0] * wTop + sBottom[0] * wBottom) >> 8;
-							d[1] = (s[1] * wTop + sBottom[1] * wBottom) >> 8;
-							d[2] = (s[2] * wTop + sBottom[2] * wBottom) >> 8;
-						} else {
-							// calculate the weighted sum of all four
-							// interpolated pixels
-							const uint16 wLeft = xWeights[x].weight;
-							const uint16 wRight = 255 - wLeft;
-							// left and right of top row
-							uint32 t0 = (s[0] * wLeft + s[4] * wRight) * wTop;
-							uint32 t1 = (s[1] * wLeft + s[5] * wRight) * wTop;
-							uint32 t2 = (s[2] * wLeft + s[6] * wRight) * wTop;
+					} else {
+						for (int32 x = xIndexL; x <= xIndexR; x++) {
+							const uint8* s = src + xWeights[x].index;
+							if (xWeights[x].weight == 255) {
+								// Prevent out of bounds access on the right
+								// edge or simply speed up.
+								const uint8* sBottom = s + srcBPR;
+								d[0] = (s[0] * wTop + sBottom[0] * wBottom)
+									>> 8;
+								d[1] = (s[1] * wTop + sBottom[1] * wBottom)
+									>> 8;
+								d[2] = (s[2] * wTop + sBottom[2] * wBottom)
+									>> 8;
+							} else {
+								// calculate the weighted sum of all four
+								// interpolated pixels
+								const uint16 wLeft = xWeights[x].weight;
+								const uint16 wRight = 255 - wLeft;
+								// left and right of top row
+								uint32 t0 = (s[0] * wLeft + s[4] * wRight)
+									* wTop;
+								uint32 t1 = (s[1] * wLeft + s[5] * wRight)
+									* wTop;
+								uint32 t2 = (s[2] * wLeft + s[6] * wRight)
+									* wTop;
 
-							// left and right of bottom row
-							s += srcBPR;
-							t0 += (s[0] * wLeft + s[4] * wRight) * wBottom;
-							t1 += (s[1] * wLeft + s[5] * wRight) * wBottom;
-							t2 += (s[2] * wLeft + s[6] * wRight) * wBottom;
+								// left and right of bottom row
+								s += srcBPR;
+								t0 += (s[0] * wLeft + s[4] * wRight) * wBottom;
+								t1 += (s[1] * wLeft + s[5] * wRight) * wBottom;
+								t2 += (s[2] * wLeft + s[6] * wRight) * wBottom;
 
-							d[0] = t0 >> 16;
-							d[1] = t1 >> 16;
-							d[2] = t2 >> 16;
+								d[0] = t0 >> 16;
+								d[1] = t1 >> 16;
+								d[2] = t2 >> 16;
+							}
+							d += 4;
 						}
-						d += 4;
 					}
+					dst += dstBPR;
 				}
-				dst += dstBPR;
+				break;
 			}
-		} else {
-			// In this mode we anticipate many pixels wich need filtering,
-			// there are no special cases for direct hit pixels except for the
-			// last column/row and the right/bottom corner pixel.
 
-			// The last column/row handling does not need to be performed
-			// for all clipping rects!
-			int32 yMax = y2;
-			if (yWeights[yMax].weight == 255)
-				yMax--;
-			int32 xIndexMax = xIndexR;
-			if (xWeights[xIndexMax].weight == 255)
-				xIndexMax--;
+			case kUsePlainCVersion:
+			{
+				// In this mode we anticipate many pixels which need filtering,
+				// there are no special cases for direct hit pixels except for
+				// the last column/row and the right/bottom corner pixel.
 
-			for (; y1 <= yMax; y1++) {
-				// cache the weight of the top and bottom row
-				const uint16 wTop = yWeights[y1].weight;
-				const uint16 wBottom = 255 - yWeights[y1].weight;
+				// The last column/row handling does not need to be performed
+				// for all clipping rects!
+				int32 yMax = y2;
+				if (yWeights[yMax].weight == 255)
+					yMax--;
+				int32 xIndexMax = xIndexR;
+				if (xWeights[xIndexMax].weight == 255)
+					xIndexMax--;
 
-				// buffer offset into source (top row)
+				for (; y1 <= yMax; y1++) {
+					// cache the weight of the top and bottom row
+					const uint16 wTop = yWeights[y1].weight;
+					const uint16 wBottom = 255 - yWeights[y1].weight;
+
+					// buffer offset into source (top row)
+					register const uint8* src
+						= srcBuffer.row_ptr(yWeights[y1].index);
+					// buffer handle for destination to be incremented per
+					// pixel
+					register uint8* d = dst;
+
+					for (int32 x = xIndexL; x <= xIndexMax; x++) {
+						const uint8* s = src + xWeights[x].index;
+						// calculate the weighted sum of all four
+						// interpolated pixels
+						const uint16 wLeft = xWeights[x].weight;
+						const uint16 wRight = 255 - wLeft;
+						// left and right of top row
+						uint32 t0 = (s[0] * wLeft + s[4] * wRight) * wTop;
+						uint32 t1 = (s[1] * wLeft + s[5] * wRight) * wTop;
+						uint32 t2 = (s[2] * wLeft + s[6] * wRight) * wTop;
+
+						// left and right of bottom row
+						s += srcBPR;
+						t0 += (s[0] * wLeft + s[4] * wRight) * wBottom;
+						t1 += (s[1] * wLeft + s[5] * wRight) * wBottom;
+						t2 += (s[2] * wLeft + s[6] * wRight) * wBottom;
+						d[0] = t0 >> 16;
+						d[1] = t1 >> 16;
+						d[2] = t2 >> 16;
+						d += 4;
+					}
+					// last column of pixels if necessary
+					if (xIndexMax < xIndexR) {
+						const uint8* s = src + xWeights[xIndexR].index;
+						const uint8* sBottom = s + srcBPR;
+						d[0] = (s[0] * wTop + sBottom[0] * wBottom) >> 8;
+						d[1] = (s[1] * wTop + sBottom[1] * wBottom) >> 8;
+						d[2] = (s[2] * wTop + sBottom[2] * wBottom) >> 8;
+					}
+
+					dst += dstBPR;
+				}
+
+				// last row of pixels if necessary
+				// buffer offset into source (bottom row)
 				register const uint8* src
-					= srcBuffer.row_ptr(yWeights[y1].index);
+					= srcBuffer.row_ptr(yWeights[y2].index);
 				// buffer handle for destination to be incremented per pixel
 				register uint8* d = dst;
 
-				for (int32 x = xIndexL; x <= xIndexMax; x++) {
-					const uint8* s = src + xWeights[x].index;
-					// calculate the weighted sum of all four
-					// interpolated pixels
-					const uint16 wLeft = xWeights[x].weight;
-					const uint16 wRight = 255 - wLeft;
-					// left and right of top row
-					uint32 t0 = (s[0] * wLeft + s[4] * wRight) * wTop;
-					uint32 t1 = (s[1] * wLeft + s[5] * wRight) * wTop;
-					uint32 t2 = (s[2] * wLeft + s[6] * wRight) * wTop;
+				if (yMax < y2) {
+					for (int32 x = xIndexL; x <= xIndexMax; x++) {
+						const uint8* s = src + xWeights[x].index;
+						const uint16 wLeft = xWeights[x].weight;
+						const uint16 wRight = 255 - wLeft;
+						d[0] = (s[0] * wLeft + s[4] * wRight) >> 8;
+						d[1] = (s[1] * wLeft + s[5] * wRight) >> 8;
+						d[2] = (s[2] * wLeft + s[6] * wRight) >> 8;
+						d += 4;
+					}
+				}
 
-					// left and right of bottom row
-					s += srcBPR;
-					t0 += (s[0] * wLeft + s[4] * wRight) * wBottom;
-					t1 += (s[1] * wLeft + s[5] * wRight) * wBottom;
-					t2 += (s[2] * wLeft + s[6] * wRight) * wBottom;
-					d[0] = t0 >> 16;
-					d[1] = t1 >> 16;
-					d[2] = t2 >> 16;
-					d += 4;
-				}
-				// last column of pixels if necessary
-				if (xIndexMax < xIndexR) {
+				// pixel in bottom right corner if necessary
+				if (yMax < y2 && xIndexMax < xIndexR) {
 					const uint8* s = src + xWeights[xIndexR].index;
-					const uint8* sBottom = s + srcBPR;
-					d[0] = (s[0] * wTop + sBottom[0] * wBottom) >> 8;
-					d[1] = (s[1] * wTop + sBottom[1] * wBottom) >> 8;
-					d[2] = (s[2] * wTop + sBottom[2] * wBottom) >> 8;
+					*(uint32*)d = *(uint32*)s;
 				}
-
-				dst += dstBPR;
+				break;
 			}
 
-			// last row of pixels if necessary
-			// buffer offset into source (bottom row)
-			register const uint8* src = srcBuffer.row_ptr(yWeights[y2].index);
-			// buffer handle for destination to be incremented per pixel
-			register uint8* d = dst;
+			case kUseSIMDVersion:
+			{
+				// Basically the same as the "standard" mode, but we use SIMD
+				// routines for the processing of the single display lines.
 
-			if (yMax < y2) {
-				for (int32 x = xIndexL; x <= xIndexMax; x++) {
-					const uint8* s = src + xWeights[x].index;
-					const uint16 wLeft = xWeights[x].weight;
-					const uint16 wRight = 255 - wLeft;
-					d[0] = (s[0] * wLeft + s[4] * wRight) >> 8;
-					d[1] = (s[1] * wLeft + s[5] * wRight) >> 8;
-					d[2] = (s[2] * wLeft + s[6] * wRight) >> 8;
-					d += 4;
+				// The last column/row handling does not need to be performed
+				// for all clipping rects!
+				int32 yMax = y2;
+				if (yWeights[yMax].weight == 255)
+					yMax--;
+				int32 xIndexMax = xIndexR;
+				if (xWeights[xIndexMax].weight == 255)
+					xIndexMax--;
+
+				for (; y1 <= yMax; y1++) {
+					// cache the weight of the top and bottom row
+					const uint16 wTop = yWeights[y1].weight;
+					const uint16 wBottom = 255 - yWeights[y1].weight;
+
+					// buffer offset into source (top row)
+					const uint8* src = srcBuffer.row_ptr(yWeights[y1].index);
+					// buffer handle for destination to be incremented per
+					// pixel
+					uint8* d = dst;
+					bilinear_scale_xloop_mmxsse(src, dst, xWeights,	xIndexL,
+						xIndexMax, wTop, srcBPR);
+					// increase pointer by processed pixels
+					d += (xIndexMax - xIndexL + 1) * 4;
+
+					// last column of pixels if necessary
+					if (xIndexMax < xIndexR) {
+						const uint8* s = src + xWeights[xIndexR].index;
+						const uint8* sBottom = s + srcBPR;
+						d[0] = (s[0] * wTop + sBottom[0] * wBottom) >> 8;
+						d[1] = (s[1] * wTop + sBottom[1] * wBottom) >> 8;
+						d[2] = (s[2] * wTop + sBottom[2] * wBottom) >> 8;
+					}
+
+					dst += dstBPR;
 				}
-			}
 
-			// pixel in bottom right corner if necessary
-			if (yMax < y2 && xIndexMax < xIndexR) {
-				const uint8* s = src + xWeights[xIndexR].index;
-				*(uint32*)d = *(uint32*)s;
+				// last row of pixels if necessary
+				// buffer offset into source (bottom row)
+				register const uint8* src
+					= srcBuffer.row_ptr(yWeights[y2].index);
+				// buffer handle for destination to be incremented per pixel
+				register uint8* d = dst;
+
+				if (yMax < y2) {
+					for (int32 x = xIndexL; x <= xIndexMax; x++) {
+						const uint8* s = src + xWeights[x].index;
+						const uint16 wLeft = xWeights[x].weight;
+						const uint16 wRight = 255 - wLeft;
+						d[0] = (s[0] * wLeft + s[4] * wRight) >> 8;
+						d[1] = (s[1] * wLeft + s[5] * wRight) >> 8;
+						d[2] = (s[2] * wLeft + s[6] * wRight) >> 8;
+						d += 4;
+					}
+				}
+
+				// pixel in bottom right corner if necessary
+				if (yMax < y2 && xIndexMax < xIndexR) {
+					const uint8* s = src + xWeights[xIndexR].index;
+					*(uint32*)d = *(uint32*)s;
+				}
+				break;
 			}
-		}
+		} // switch(codeselect)
 	} while (fBaseRenderer.next_clip_box());
 
 #ifdef FILTER_INFOS_ON_HEAP

Modified: haiku/trunk/src/servers/app/drawing/Painter/Painter.h
===================================================================
--- haiku/trunk/src/servers/app/drawing/Painter/Painter.h	2009-06-21 21:29:57 UTC (rev 31164)
+++ haiku/trunk/src/servers/app/drawing/Painter/Painter.h	2009-06-21 22:07:54 UTC (rev 31165)
@@ -24,6 +24,15 @@
 #include <Font.h>
 #include <Rect.h>
 
+
+// Prototype of the assembler x-loop from painter_bilinear_scale.nasm
+extern "C" {
+	void bilinear_scale_xloop_mmxsse(const uint8* src, void* dst, void* xWeights,
+		uint32 xmin, uint32 xmax, uint32 wTop, uint32 srcBPR );
+}
+
+extern uint32 gAppServerSIMDFlags;
+
 class BBitmap;
 class BRegion;
 class BGradient;

Added: haiku/trunk/src/servers/app/drawing/Painter/painter_bilinear_scale.nasm
===================================================================
--- haiku/trunk/src/servers/app/drawing/Painter/painter_bilinear_scale.nasm	2009-06-21 21:29:57 UTC (rev 31164)
+++ haiku/trunk/src/servers/app/drawing/Painter/painter_bilinear_scale.nasm	2009-06-21 22:07:54 UTC (rev 31165)
@@ -0,0 +1,217 @@
+;
+; Copyright 2009, Christian Packmann.
+; All rights reserved.
+; Distributed under the terms of the MIT License, see
+; http://www.opensource.org/licenses/mit-license.php
+
+; Assembly code for Painter::_DrawBitmapBilinearCopy32() in Painter.cpp
+; This code implements only the inner x-loop, all other processing
+; is done in the C code.
+
+
+; ******  GENERAL NOTES  *****
+
+; The implemented algorithm looks like this:
+; (pixLT * leftWeight  +  pixRT * rightWeight) * topWeight
+;                            +
+; (pixLB * leftWeight  +  pixRB * rightWeight) * bottomWeight
+;
+; with LT = LeftTop, RT = RightTop, LB = LeftBottom, RB = RightBottom
+;
+; For more detailed information, see the C implementation in
+; Painter.cpp
+;
+; Implementation notes:
+; The calculations are performed with 16-bit arithmetic. All values
+; are held in vars/registers as 8-bit values high-shifted by 8 bits;
+; i.e. 255<<8. This works because PMULHUW is used for MULs, and this
+; algorithm limits the variable values appropriately during all steps.
+; This will not work for all algorithms, so take note of that if you
+; want to recycle some of the code.
+
+; Notes on the code itself:
+; I've tried to keep the code small. That's why I'm using memory accesses
+; via index registers as much as possible. This costs execution time due
+; to the generated µops, but should minimize decode bandwidth pressure
+; due to the many MMX instructions.
+; Temporary variables are always stored to the stack instead of global
+; data space for this reason. So far I haven't exceeded 8-byte offsets,
+; so the instructions only need to encode a BYTE-offset instead of a DWORD.
+
+; Notes on code formatting/comments:
+; - integer and vector instructions are indented differently. I find this
+;   helpful when parsing code, especially when I haven't looked at it for a
+;   longer time.
+; - I've tried to comment the code so that it will be understandable and
+;   maintainable in the future, and also by other persons than myself.
+;   The current comments aren't yet fully standardized, I'm still working
+;   on a coherent system for indicating the variables held within a register
+;   which will help in understanding the data flow. Any suggestions
+;   regarding this are welcome.
+; - Abbreviations for datatypes:
+;   B = Byte		  8 bit
+;   W = Word		 16 bit
+;   DW = Doubleword	 32 bit
+;   QW = Quadword	 64 bit
+;	DQ = Doublequad	128 bit
+;	A "p" in front of one of the datatypes signifies that the
+;	variable/register is encoded in packed form; i.e. pW means
+;	"packed Words"; four Words for a MMX register, 8 for a SSE register.
+;	This should help in understanding the logical meaning of the data
+;	transformations.
+;	For better readability, the datatype indicator for a register is
+;   bracketed with '#', a MMX register with 2 uint32 of value 255 would be
+;   #pD# 255 255
+
+
+
+; ******  Global exports  *****
+
+; Do NOT use '_' in front of your defines, this is done
+; with YASMs --prefix option at assembly time.
+GLOBAL bilinear_scale_xloop_mmxsse
+
+
+; ********************
+; ******  DATA  ******
+; ********************
+SECTION .data
+
+DATA_SECTION:
+ALIGN 16
+DATA_SSSE3:
+; data which is identical for MMX and SSE code is shared by declaring
+; it as DQ but providing two labels. MMX code just accesses the
+; first half.
+c4x16UW_129_LShift8: 	TIMES 4 dw 129<<8
+c4x16UW_255_LShift8:	TIMES 4 dw 255<<8
+c2x32UD_ff000000:		TIMES 4 DD 0xff000000
+
+; Argument definitions
+
+; Parameter offsets assume "push ebp"
+PAR_srcPtr EQU 	8
+PAR_dstPtr EQU 	12
+PAR_xWeightPtr EQU 16
+PAR_xmin EQU 	20
+PAR_xmax EQU 	24
+PAR_wTop EQU 	28
+PAR_srcBPR EQU 	32
+
+; Stack storage offsets (esp-relative). Only 0..31 are reserved by the prologue!
+ST_Q_wTop					EQU 0
+ST_Q_wBottom				EQU 8
+ST_Q_c4x16UW_129_LShift8	EQU 16
+ST_Q_c4x16UW_255_LShift8	EQU 24
+ST_Q_lftWeight_A			EQU 32
+ST_Q_rgtWeight_A			EQU 40
+ST_Q_lftWeight_B			EQU 48
+ST_Q_rgtWeight_B			EQU 56
+
+
+; ********************
+; ******  CODE  ******
+; ********************
+SECTION .code
+
+
+; void bilinear_scale_xloop_mmxsse(const uint8* src, void* dst, void* xWeights,
+;				uint32 xmin, uint32 xmax, uint32 wTop, uint32 srcBPR )
+; Loop stats:
+;		34 instructions (6 moves, 5 integer, 23 vector)
+; 		12 memory accesses
+ALIGN 16
+bilinear_scale_xloop_mmxsse:
+	push	ebp
+	mov		ebp, esp
+	and		esp, 0xfffffff8	; align stack to 8-byte boundary
+	push	ebx
+	push	edi
+	push	esi
+	sub		esp, 4 + 32	; +4 aligns to 8-byte boundary again; add 4 x QW
+; xmin > xmax? Signed like the loop test: the caller may pass xmax = xmin - 1 = -1
+	mov		eax, [ebp + PAR_xmin]
+	cmp		eax, [ebp + PAR_xmax]
+	jg		.exit
+; preparations
+	; prepare wTop
+	mov		eax, [ebp + PAR_wTop]	; #pB#: 0 0 0 top
+	shl		eax, 8					; #pB#: 0 0 top 0
+	movd		mm0, eax			; #pW# 0 0 0 top
+	pshufw		mm0, mm0, 00000000b	; #pW# top top top top
+	movq		[esp + ST_Q_wTop], mm0
+	; move constants
+	movq		mm5, [c4x16UW_255_LShift8]
+	movq		[esp + ST_Q_c4x16UW_255_LShift8], mm5
+	; prepare wBottom
+	movq		mm1, mm5	; #pW# 255 255 255 255
+	psubw		mm1, mm0	; 255 - wTop = wBottom
+	movq		[esp + ST_Q_wBottom], mm1
+
+; load params; leave ebx, ecx as scratch
+	mov		eax, [ebp + PAR_xmin]	; loop counter
+	mov		edx, [ebp + PAR_xWeightPtr]	; xWeights array
+	mov		esi, [ebp + PAR_srcPtr]		; source bitmap
+	mov		edi, [ebp + PAR_dstPtr]		; destination bitmap
+	movq	mm6, [c4x16UW_129_LShift8]
+	movq	mm7, [c2x32UD_ff000000]
+
+; main loop
+ALIGN 16
+.loop:
+	; load Left/Right weights into mm0/mm1
+	movzx	ebx, WORD [edx + eax*4 + 2] ; xWeights + x*4 + 2-> FilterInfo[x].weight
+	shl			ebx, 8		; #pB# 0 0 leftW 0
+	pxor		mm2, mm2	; clear before use
+	movd		mm0, ebx	; #pW# 0 0 0 leftW
+	movq		mm1, [esp + ST_Q_c4x16UW_255_LShift8]
+	pshufw		mm0, mm0, 00000000b	; #pW# lW lW lW lW
+	psubw		mm1, mm0			; #pW# rW rW rW rW
+	movzx	ecx, WORD [edx + eax*4] ; xWeights + x*4 -> FilterInfo[x].index
+	pxor		mm3, mm3	; clear before use
+	mov		ebx, ecx
+	; process top and bottom pixels, interleave instructions to avoid latencies
+	pxor		mm4, mm4	; clear before use
+	; unpack pixel to high byte
+	punpcklbw	mm2, [esi + ecx]	; pixLeftTop
+	; unpack pixel to high byte
+	punpcklbw	mm3, [esi + ecx + 4] ; pixRightTop
+
+	add		ebx, [ebp + PAR_srcBPR]	; address:bottom pixels
+	pmulhuw		mm2, mm0	; pixLT * leftWeight
+	pmulhuw		mm3, mm1	; pixRT * rightWeight
+	; calc address for bottom pix
+	pxor		mm5, mm5	; clear before use
+	punpcklbw	mm4, [esi + ebx]	; pixLeftBottom
+	punpcklbw	mm5, [esi + ebx + 4] ; pixRightBottom
+	pmulhuw		mm4, mm0	; pixLB * leftWeight
+	pmulhuw		mm5, mm1	; pixRB * rightWeight
+
+	paddw		mm2, mm3	; pixLT + pixRT
+	paddw		mm4, mm5	; pixLB + pixRB
+	pmulhuw		mm2, [esp + ST_Q_wTop]	; * weightTop
+	pmulhuw		mm4, [esp + ST_Q_wBottom]	; * weightBottom
+
+	; add both temp results
+	paddw		mm2, mm4
+	; divide by 65025 using integer reciprocal: (*129 >> 7)
+	pmulhuw		mm2, mm6
+	psrlw		mm2, 7
+	; pack & store
+	packuswb	mm2, mm2
+	por			mm2, mm7	; | 0xff000000
+	movd		[edi], mm2	; store pixel as DWord
+	add		edi, 4
+; loopctr <= xmax?
+	inc		eax
+	cmp		eax, [ebp + PAR_xmax]
+	jle		.loop
+.exit:
+	emms	; Don't EVER forget to call EMMS!
+	add		esp, 4 + 32	; restore stack pointer
+	pop		esi
+	pop		edi
+	pop		ebx
+	mov		esp, ebp
+	pop		ebp
+	ret




More information about the Haiku-commits mailing list