junjunnさんの描画高速化取り込み

遅くなりましたが junjunn さんの描画高速化の成果を trunk/mona に取り込みました。

  • memset高速化
  • memcpy高速化
  • 計算量を減らす


などの方法がとられています。素晴らしい。
以下 diff です。

Index: string.cpp
===================================================================
--- string.cpp	(リビジョン 4022)
+++ string.cpp	(リビジョン 4023)
@@ -66,6 +66,7 @@
     \author HigePon
     \date   create:2002/12/15 update:
 */
+#if 0 // original
 void *memset(void* buf, int value, size_t size) {
 
     char *p = (char*)buf;
@@ -77,7 +78,28 @@
     }
     return buf;
 }
+#else // junjunn
+inline void divide(int iNumerator,int iDenominator,uint32_t* piQuotient,uint32_t* piRemainder)
+{   // Writes the truncated quotient and the remainder of iNumerator / iDenominator.
+    const int iWhole = iNumerator / iDenominator;
+    *piQuotient = iWhole; *piRemainder = iNumerator % iDenominator;
+}
 
+// Fills nCount bytes at pTo with the low byte of iValue and returns pTo.
+void *memset(void* pTo,int iValue,size_t nCount)
+{
+    uint8_t byValue = (uint8_t)iValue;
+    // Replicate the byte into all four lanes; unsigned math avoids shifting into the sign bit.
+    uint32_t dwValue4 = byValue * 0x01010101u;
+    // Split the size in size_t so large counts never pass through a signed int.
+    size_t n4 = nCount / 4, n1 = nCount % 4;
+    uint32_t* pdwTo = (uint32_t*)pTo;
+    while (n4--) {*pdwTo++ = dwValue4;}   // bulk: one 32-bit store per 4 bytes
+    uint8_t* pcTo = (uint8_t*)pdwTo;
+    while (n1--) {*pcTo++ = byValue;}     // tail: remaining 0-3 bytes
+    return pTo; // the previous version fell off the end without returning a value
+}
+#endif
 /*!
     \brief strlen
 
@@ -142,19 +164,34 @@
     return (unsigned char)*str1 - (unsigned char)*str2;
 }
 
-void* memcpy(void* s1, void* s2, size_t size) {
+#if 1 // junjunn
+// Copies nCount bytes from cpFrom to pTo (ascending addresses) and returns pTo.
+void* memcpy(void* pTo,void* cpFrom,size_t nCount)
+{
+    // Split the size with unsigned math: 4-byte units first, then the 0-3 tail bytes.
+    uint32_t i4ByteCount = (uint32_t)(nCount / 4);
+    uint32_t i1ByteCount = (uint32_t)(nCount % 4);
+    void* pDst = pTo;
+    void* pSrc = cpFrom;
+
+    // One asm statement covers both phases: the byte copy continues from the
+    // EDI/ESI values left by the dword copy, and the compiler gives no guarantee
+    // that registers keep their contents between two separate asm statements.
+    asm volatile("cld            \n"
+                 "rep movsd      \n"   // 4-byte transfer part
+                 "movl %3, %%ecx \n"
+                 "rep movsb      \n"   // 1-byte transfer part
+                 : "+&D"(pDst), "+&S"(pSrc), "+&c"(i4ByteCount) // early-clobber: %3 must not share these
+                 : "r"(i1ByteCount)
+                 // "memory": the copy reads and writes memory that is not named as an operand
+                 : "memory", "cc");
 
+    return pTo;
+}
+#else
+void* memcpy(void* s1, void* s2, size_t size)
+{
     asm volatile("movl %0, %%edi \n"
                  "movl %1, %%esi \n"
                  "movl %2, %%ecx \n"
@@ -167,6 +204,8 @@
 
     return s1;
 }
+#endif
+
 
 char* strncpy(char* s1, const char* s2, size_t n) {
 
Index: VesaConsole.h
===================================================================
--- VesaConsole.h	(リビジョン 4022)
+++ VesaConsole.h	(リビジョン 4023)
@@ -87,6 +87,7 @@
         uint32_t vramAddress;
         uint16_t uint8_tsPerScanLine;
         uint16_t bitsPerPixel;
+        uint16_t bytesPerPixel;
     };
 
     VesaScreen screen;
Index: VesaConsole.cpp
===================================================================
--- VesaConsole.cpp	(リビジョン 4022)
+++ VesaConsole.cpp	(リビジョン 4023)
@@ -260,6 +260,7 @@
     vramAddress = info->physBasePtr;
     uint8_tsPerScanLine = info->uint8_tsPerScanLine;
     bitsPerPixel = info->bitsPerPixel;
+    bytesPerPixel = bitsPerPixel / 8;
 
     selectMethod(info);
 }
@@ -333,63 +334,18 @@
     }
 #else
     // enough speed at -O3
-    memset((void*)vramAddress, 0xff, w * h * (bitsPerPixel / 8));
+    memset((void*)vramAddress, 0xff, w * h * bytesPerPixel);
 #endif
 }
 
 void VesaConsole::VesaScreen::clearScreenBlack(int w, int h)
 {
-    memset((void*)vramAddress, 0x00, w * h * (bitsPerPixel / 8));
+    memset((void*)vramAddress, 0x00, w * h * bytesPerPixel); // zero-fill w * h * bytesPerPixel bytes starting at vramAddress
 }