mirror of
https://github.com/yuanyuanxiang/SimpleRemoter.git
synced 2026-01-25 00:33:09 +08:00
Improve: Use SSE2 to speed up bitmap comparison
This commit is contained in:
@@ -1,4 +1,4 @@
|
|||||||
#pragma once
|
#pragma once
|
||||||
#include "stdafx.h"
|
#include "stdafx.h"
|
||||||
#include <assert.h>
|
#include <assert.h>
|
||||||
#include "CursorInfo.h"
|
#include "CursorInfo.h"
|
||||||
@@ -13,13 +13,14 @@
|
|||||||
#include <condition_variable>
|
#include <condition_variable>
|
||||||
#include <functional>
|
#include <functional>
|
||||||
#include <future>
|
#include <future>
|
||||||
|
#include <emmintrin.h> // SSE2
|
||||||
#include "X264Encoder.h"
|
#include "X264Encoder.h"
|
||||||
|
|
||||||
|
|
||||||
class ThreadPool
|
class ThreadPool
|
||||||
{
|
{
|
||||||
public:
|
public:
|
||||||
// <EFBFBD><EFBFBD><EFBFBD>캯<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>̶<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>߳<EFBFBD>
|
// 构造函数:创建固定数量的线程
|
||||||
ThreadPool(size_t numThreads) : stop(false)
|
ThreadPool(size_t numThreads) : stop(false)
|
||||||
{
|
{
|
||||||
for (size_t i = 0; i < numThreads; ++i) {
|
for (size_t i = 0; i < numThreads; ++i) {
|
||||||
@@ -37,14 +38,14 @@ public:
|
|||||||
try {
|
try {
|
||||||
task();
|
task();
|
||||||
} catch (...) {
|
} catch (...) {
|
||||||
// <EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>쳣
|
// 处理异常
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
});
|
});
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
// <EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>̳߳<EFBFBD>
|
// 析构函数:销毁线程池
|
||||||
~ThreadPool()
|
~ThreadPool()
|
||||||
{
|
{
|
||||||
{
|
{
|
||||||
@@ -56,7 +57,7 @@ public:
|
|||||||
worker.join();
|
worker.join();
|
||||||
}
|
}
|
||||||
|
|
||||||
// <EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ύ
|
// 任务提交
|
||||||
template<typename F>
|
template<typename F>
|
||||||
auto enqueue(F&& f) -> std::future<decltype(f())>
|
auto enqueue(F&& f) -> std::future<decltype(f())>
|
||||||
{
|
{
|
||||||
@@ -98,7 +99,7 @@ private:
|
|||||||
MONITORINFOEX mi;
|
MONITORINFOEX mi;
|
||||||
mi.cbSize = sizeof(MONITORINFOEX);
|
mi.cbSize = sizeof(MONITORINFOEX);
|
||||||
if (GetMonitorInfo(hMonitor, &mi)) {
|
if (GetMonitorInfo(hMonitor, &mi)) {
|
||||||
monitors->push_back(mi); // <EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʾ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ϣ
|
monitors->push_back(mi); // 保存显示器信息
|
||||||
}
|
}
|
||||||
return TRUE;
|
return TRUE;
|
||||||
}
|
}
|
||||||
@@ -109,31 +110,32 @@ private:
|
|||||||
return monitors;
|
return monitors;
|
||||||
}
|
}
|
||||||
public:
|
public:
|
||||||
ThreadPool* m_ThreadPool; // <EFBFBD>̳߳<EFBFBD>
|
ThreadPool* m_ThreadPool; // 线程池
|
||||||
BYTE* m_FirstBuffer; // <EFBFBD><EFBFBD>һ֡<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
BYTE* m_FirstBuffer; // 上一帧数据
|
||||||
BYTE* m_RectBuffer; // <EFBFBD><EFBFBD>ǰ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
BYTE* m_RectBuffer; // 当前缓存区
|
||||||
LPBYTE* m_BlockBuffers; // <EFBFBD>ֿ黺<EFBFBD><EFBFBD>
|
LPBYTE* m_BlockBuffers; // 分块缓存
|
||||||
ULONG* m_BlockSizes; // <EFBFBD>ֿ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
ULONG* m_BlockSizes; // 分块差异像素数
|
||||||
int m_BlockNum; // <EFBFBD>ֿ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
int m_BlockNum; // 分块个数
|
||||||
int m_SendQuality; // <EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
int m_SendQuality; // 发送质量
|
||||||
|
|
||||||
LPBITMAPINFO m_BitmapInfor_Full; // BMP<EFBFBD><EFBFBD>Ϣ
|
LPBITMAPINFO m_BitmapInfor_Full; // BMP信息
|
||||||
BYTE m_bAlgorithm; // <EFBFBD><EFBFBD>Ļ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>㷨
|
BYTE m_bAlgorithm; // 屏幕差异算法
|
||||||
|
|
||||||
int m_iScreenX; // <EFBFBD><EFBFBD>ʼx<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
int m_iScreenX; // 起始x坐标
|
||||||
int m_iScreenY; // <EFBFBD><EFBFBD>ʼy<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
int m_iScreenY; // 起始y坐标
|
||||||
ULONG m_ulFullWidth; // <EFBFBD><EFBFBD>Ļ<EFBFBD><EFBFBD>
|
ULONG m_ulFullWidth; // 屏幕宽
|
||||||
ULONG m_ulFullHeight; // <EFBFBD><EFBFBD>Ļ<EFBFBD><EFBFBD>
|
ULONG m_ulFullHeight; // 屏幕高
|
||||||
bool m_bZoomed; // <EFBFBD><EFBFBD>Ļ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
bool m_bZoomed; // 屏幕被缩放
|
||||||
double m_wZoom; // <EFBFBD><EFBFBD>Ļ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ű<EFBFBD>
|
double m_wZoom; // 屏幕横向缩放比
|
||||||
double m_hZoom; // <EFBFBD><EFBFBD>Ļ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ű<EFBFBD>
|
double m_hZoom; // 屏幕纵向缩放比
|
||||||
|
|
||||||
int m_biBitCount; // ÿ<EFBFBD><EFBFBD><EFBFBD>ر<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
int m_biBitCount; // 每像素比特数
|
||||||
int m_FrameID; // ֡<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
int m_FrameID; // 帧序号
|
||||||
int m_GOP; // <EFBFBD>ؼ<EFBFBD>֡<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
int m_GOP; // 关键帧间隔
|
||||||
bool m_SendKeyFrame; // <EFBFBD><EFBFBD><EFBFBD>ؼ<EFBFBD>֡
|
bool m_SendKeyFrame; // 发送关键帧
|
||||||
CX264Encoder *m_encoder; // <EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
CX264Encoder *m_encoder; // 编码器
|
||||||
int m_nScreenCount; // <EFBFBD><EFBFBD>Ļ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
int m_nScreenCount; // 屏幕数量
|
||||||
|
BOOL m_bEnableMultiScreen;// 多显示器支持
|
||||||
|
|
||||||
ScreenCapture(int n = 32, BYTE algo = ALGORITHM_DIFF, BOOL all = FALSE) :
|
ScreenCapture(int n = 32, BYTE algo = ALGORITHM_DIFF, BOOL all = FALSE) :
|
||||||
m_ThreadPool(nullptr), m_FirstBuffer(nullptr), m_RectBuffer(nullptr),
|
m_ThreadPool(nullptr), m_FirstBuffer(nullptr), m_RectBuffer(nullptr),
|
||||||
@@ -147,6 +149,7 @@ public:
|
|||||||
static auto monitors = GetAllMonitors();
|
static auto monitors = GetAllMonitors();
|
||||||
static int index = 0;
|
static int index = 0;
|
||||||
m_nScreenCount = monitors.size();
|
m_nScreenCount = monitors.size();
|
||||||
|
m_bEnableMultiScreen = all;
|
||||||
if (all && !monitors.empty()) {
|
if (all && !monitors.empty()) {
|
||||||
int idx = index++ % (monitors.size()+1);
|
int idx = index++ % (monitors.size()+1);
|
||||||
if (idx == 0) {
|
if (idx == 0) {
|
||||||
@@ -162,8 +165,8 @@ public:
|
|||||||
m_ulFullHeight = rt.bottom - rt.top;
|
m_ulFullHeight = rt.bottom - rt.top;
|
||||||
}
|
}
|
||||||
} else {
|
} else {
|
||||||
//::GetSystemMetrics(SM_CXSCREEN/SM_CYSCREEN)<EFBFBD><EFBFBD>ȡ<EFBFBD><EFBFBD>Ļ<EFBFBD><EFBFBD>С<EFBFBD><EFBFBD>
|
//::GetSystemMetrics(SM_CXSCREEN/SM_CYSCREEN)获取屏幕大小不准
|
||||||
//<EFBFBD><EFBFBD><EFBFBD>統<EFBFBD><EFBFBD>Ļ<EFBFBD><EFBFBD>ʾ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ϊ125%ʱ<><CAB1><EFBFBD><EFBFBD>ȡ<EFBFBD><C8A1><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ļ<EFBFBD><C4BB>С<EFBFBD><D0A1>Ҫ<EFBFBD><D2AA><EFBFBD><EFBFBD>1.25<EFBFBD>Ŷ<EFBFBD>
|
//例如当屏幕显示比例为125%时,获取到的屏幕大小需要乘以1.25才对
|
||||||
DEVMODE devmode;
|
DEVMODE devmode;
|
||||||
memset(&devmode, 0, sizeof(devmode));
|
memset(&devmode, 0, sizeof(devmode));
|
||||||
devmode.dmSize = sizeof(DEVMODE);
|
devmode.dmSize = sizeof(DEVMODE);
|
||||||
@@ -174,7 +177,7 @@ public:
|
|||||||
int w = GetSystemMetrics(SM_CXSCREEN), h = GetSystemMetrics(SM_CYSCREEN);
|
int w = GetSystemMetrics(SM_CXSCREEN), h = GetSystemMetrics(SM_CYSCREEN);
|
||||||
m_bZoomed = (w != m_ulFullWidth) || (h != m_ulFullHeight);
|
m_bZoomed = (w != m_ulFullWidth) || (h != m_ulFullHeight);
|
||||||
m_wZoom = double(m_ulFullWidth) / w, m_hZoom = double(m_ulFullHeight) / h;
|
m_wZoom = double(m_ulFullWidth) / w, m_hZoom = double(m_ulFullHeight) / h;
|
||||||
Mprintf("=> <EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ű<EFBFBD><EFBFBD><EFBFBD>: %.2f, %.2f\t<EFBFBD>ֱ<EFBFBD><EFBFBD>ʣ<EFBFBD>%d x %d\n", m_wZoom, m_hZoom, m_ulFullWidth, m_ulFullHeight);
|
Mprintf("=> 桌面缩放比例: %.2f, %.2f\t分辨率:%d x %d\n", m_wZoom, m_hZoom, m_ulFullWidth, m_ulFullHeight);
|
||||||
m_wZoom = 1.0 / m_wZoom, m_hZoom = 1.0 / m_hZoom;
|
m_wZoom = 1.0 / m_wZoom, m_hZoom = 1.0 / m_hZoom;
|
||||||
}
|
}
|
||||||
if (ALGORITHM_H264 == m_bAlgorithm) {
|
if (ALGORITHM_H264 == m_bAlgorithm) {
|
||||||
@@ -212,6 +215,10 @@ public:
|
|||||||
return m_nScreenCount;
|
return m_nScreenCount;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// Returns the multi-monitor support flag m_bEnableMultiScreen
// (assigned from `all` elsewhere in this class).
virtual BOOL IsMultiScreenEnabled() const {
|
||||||
|
return m_bEnableMultiScreen;
|
||||||
|
}
|
||||||
|
|
||||||
virtual int SendQuality(int quality)
|
virtual int SendQuality(int quality)
|
||||||
{
|
{
|
||||||
int old = m_SendQuality;
|
int old = m_SendQuality;
|
||||||
@@ -230,36 +237,134 @@ public:
|
|||||||
}
|
}
|
||||||
|
|
||||||
public:
|
public:
|
||||||
//*************************************** ͼ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>㷨<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>У<EFBFBD> *************************************
|
//*************************************** 图像差异算法 SSE2 优化版 *************************************
|
||||||
// Compares two equally sized frame buffers and serialises the regions that differ.
//
// CompareSourData  newly captured frame (only read).
// CompareDestData  previous frame; every emitted region is copied into it from
//                  CompareSourData, so it is updated in place.
// szBuffer         output buffer; receives a sequence of records
//                  [DWORD position][DWORD length][payload].
//                  position = byte offset of the region + startPostion.
//                  length   = region bytes, or region bytes / 4 for ALGORITHM_GRAY.
//                  payload  = raw region bytes, or one gray byte per 4 input bytes
//                             for ALGORITHM_GRAY.
// ulCompareLength  number of bytes to compare.
// algo             ALGORITHM_GRAY selects the gray payload; anything else is raw.
// startPostion     offset added to every emitted position.
//
// Returns the number of bytes written to szBuffer.
//
// The bulk of the buffer is compared 16 bytes at a time with SSE2. Identical
// stretches of at most GAP_TOLERANCE bytes that lie between two differing
// blocks are merged into the surrounding region, because each record costs an
// 8-byte header. The final (ulCompareLength % 16) bytes are compared one DWORD
// at a time.
virtual ULONG CompareBitmap(LPBYTE CompareSourData, LPBYTE CompareDestData, LPBYTE szBuffer,
                            DWORD ulCompareLength, BYTE algo, int startPostion = 0)
{
    const bool  gray          = (algo == ALGORITHM_GRAY);
    const ULONG ratio         = gray ? 4 : 1;
    const ULONG SSE_BLOCK     = 16;   // bytes compared per SSE2 step (4 pixels)
    const ULONG GAP_TOLERANCE = 32;   // largest identical stretch merged into a region
    const ULONG alignedLength = ulCompareLength & ~(SSE_BLOCK - 1);

    LPBYTE p = szBuffer;

    // True when the 16 bytes at a and b are identical. Unaligned loads are used
    // on purpose: callers hand in arbitrary byte offsets into the frame, so the
    // addresses are not guaranteed to be 16-byte aligned.
    auto blockEqual = [](const BYTE* a, const BYTE* b) -> bool {
        const __m128i va = _mm_loadu_si128((const __m128i*)a);
        const __m128i vb = _mm_loadu_si128((const __m128i*)b);
        return _mm_movemask_epi8(_mm_cmpeq_epi32(va, vb)) == 0xFFFF;
    };

    // Copies the region [index, index + ulCount) into the previous frame and
    // appends one [position][length][payload] record to the output.
    auto emitRegion = [&](ULONG index, ULONG ulCount) {
        LPBYTE pos1 = CompareDestData + index;
        LPBYTE pos2 = CompareSourData + index;

        memcpy(pos1, pos2, ulCount);

        *(LPDWORD)(p) = index + startPostion;
        *(LPDWORD)(p + sizeof(ULONG)) = ulCount / ratio;
        p += 2 * sizeof(ULONG);

        if (!gray) {
            memcpy(p, pos2, ulCount);
            p += ulCount;
        } else {
            ConvertToGray_SSE2(p, pos2, ulCount);
            p += ulCount / ratio;
        }
    };

    ULONG i = 0;
    while (i < alignedLength) {
        if (blockEqual(CompareDestData + i, CompareSourData + i)) {
            i += SSE_BLOCK;
            continue;
        }

        // First differing block of a new region.
        const ULONG index = i;
        i += SSE_BLOCK;

        // Extend the region. trailingGap counts the identical bytes consumed
        // since the last differing block; it never exceeds GAP_TOLERANCE.
        ULONG trailingGap = 0;
        while (i < alignedLength) {
            if (blockEqual(CompareDestData + i, CompareSourData + i)) {
                if (trailingGap + SSE_BLOCK > GAP_TOLERANCE)
                    break;                      // gap too large: region ends here
                trailingGap += SSE_BLOCK;
            } else {
                trailingGap = 0;                // gap was bridged by another difference
            }
            i += SSE_BLOCK;
        }

        // Identical bytes at the end of the region carry no information, so
        // drop them whether the scan stopped on a large gap or at the end of
        // the aligned range.
        i -= trailingGap;

        emitRegion(index, i - index);
    }

    // Remaining 0-12 bytes: DWORD-wise scan, no gap merging.
    LPDWORD p1 = (LPDWORD)(CompareDestData + i);
    LPDWORD p2 = (LPDWORD)(CompareSourData + i);
    for (; i < ulCompareLength; i += 4, ++p1, ++p2) {
        if (*p1 == *p2)
            continue;

        const ULONG index = i;
        ++p1;
        ++p2;
        for (i += 4; i < ulCompareLength && *p1 != *p2; i += 4, ++p1, ++p2);

        emitRegion(index, i - index);
    }

    return (ULONG)(p - szBuffer);
}
|
||||||
|
|
||||||
//*************************************** ͼ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>㷨<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>У<EFBFBD> *************************************
|
//*************************************** 图像差异算法(并行) *************************************
|
||||||
ULONG MultiCompareBitmap(LPBYTE srcData, LPBYTE dstData, LPBYTE szBuffer,
|
ULONG MultiCompareBitmap(LPBYTE srcData, LPBYTE dstData, LPBYTE szBuffer,
|
||||||
DWORD ulCompareLength, BYTE algo)
|
DWORD ulCompareLength, BYTE algo)
|
||||||
{
|
{
|
||||||
|
|
||||||
int N = m_BlockNum;
|
int N = m_BlockNum;
|
||||||
ULONG blockLength = ulCompareLength / N; // ÿ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ļ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ֽ<EFBFBD><EFBFBD><EFBFBD>
|
ULONG blockLength = ulCompareLength / N; // 每个任务的基本字节数
|
||||||
ULONG remainingLength = ulCompareLength % N; // ʣ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ֽ<EFBFBD><EFBFBD><EFBFBD>
|
ULONG remainingLength = ulCompareLength % N; // 剩余的字节数
|
||||||
|
|
||||||
std::vector<std::future<ULONG>> futures;
|
std::vector<std::future<ULONG>> futures;
|
||||||
for (int blockY = 0; blockY < N; ++blockY) {
|
for (int blockY = 0; blockY < N; ++blockY) {
|
||||||
// <EFBFBD><EFBFBD><EFBFBD>㵱ǰ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ֽ<EFBFBD><EFBFBD><EFBFBD>
|
// 计算当前任务的字节数
|
||||||
ULONG currentBlockLength = blockLength + (blockY == N - 1 ? remainingLength : 0);
|
ULONG currentBlockLength = blockLength + (blockY == N - 1 ? remainingLength : 0);
|
||||||
// <EFBFBD><EFBFBD><EFBFBD>㵱ǰ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ʼλ<EFBFBD><EFBFBD>
|
// 计算当前任务的起始位置
|
||||||
ULONG startPosition = blockY * blockLength;
|
ULONG startPosition = blockY * blockLength;
|
||||||
|
|
||||||
futures.emplace_back(m_ThreadPool->enqueue([=]() -> ULONG {
|
futures.emplace_back(m_ThreadPool->enqueue([=]() -> ULONG {
|
||||||
@@ -288,24 +393,24 @@ public:
|
|||||||
LPBYTE dstBlock = dstData + startPosition;
|
LPBYTE dstBlock = dstData + startPosition;
|
||||||
LPBYTE blockBuffer = m_BlockBuffers[blockY];
|
LPBYTE blockBuffer = m_BlockBuffers[blockY];
|
||||||
|
|
||||||
// <EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ǰ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>رȶ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ݴ<EFBFBD>С
|
// 处理当前任务并返回比对数据大小
|
||||||
return m_BlockSizes[blockY] = CompareBitmap(srcBlock, dstBlock, blockBuffer, currentBlockLength, algo, startPosition);
|
return m_BlockSizes[blockY] = CompareBitmap(srcBlock, dstBlock, blockBuffer, currentBlockLength, algo, startPosition);
|
||||||
}));
|
}));
|
||||||
}
|
}
|
||||||
|
|
||||||
// <EFBFBD>ȴ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ɲ<EFBFBD><EFBFBD><EFBFBD>ȡ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>ֵ
|
// 等待所有任务完成并获取返回值
|
||||||
for (auto& future : futures) {
|
for (auto& future : futures) {
|
||||||
future.get();
|
future.get();
|
||||||
}
|
}
|
||||||
|
|
||||||
// <EFBFBD>ϲ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>п<EFBFBD><EFBFBD>IJ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ϣ<EFBFBD><EFBFBD> szBuffer
|
// 合并所有块的差异信息到 szBuffer
|
||||||
ULONG offset = 0;
|
ULONG offset = 0;
|
||||||
for (int blockY = 0; blockY < N; ++blockY) {
|
for (int blockY = 0; blockY < N; ++blockY) {
|
||||||
memcpy(szBuffer + offset, m_BlockBuffers[blockY], m_BlockSizes[blockY]);
|
memcpy(szBuffer + offset, m_BlockBuffers[blockY], m_BlockSizes[blockY]);
|
||||||
offset += m_BlockSizes[blockY];
|
offset += m_BlockSizes[blockY];
|
||||||
}
|
}
|
||||||
|
|
||||||
return offset; // <EFBFBD><EFBFBD><EFBFBD>ػ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ĵ<EFBFBD>С
|
return offset; // 返回缓冲区的大小
|
||||||
}
|
}
|
||||||
|
|
||||||
virtual int GetFrameID() const {
|
virtual int GetFrameID() const {
|
||||||
@@ -321,10 +426,41 @@ public:
|
|||||||
return m_BitmapInfor_Full->bmiHeader.biSizeImage;
|
return m_BitmapInfor_Full->bmiHeader.biSizeImage;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
// SSE2 优化:BGRA 转单通道灰度,一次处理 4 个像素,输出 4 字节
|
||||||
|
// 灰度公式: Y = 0.299*R + 0.587*G + 0.114*B ≈ (306*R + 601*G + 117*B) >> 10
|
||||||
|
// 输入: BGRA 像素数据 (每像素 4 字节)
|
||||||
|
// 输出: 灰度值 (每像素 1 字节)
|
||||||
|
// count: 输入数据的字节数 (必须是 4 的倍数)
|
||||||
|
inline void ConvertToGray_SSE2(LPBYTE dst, LPBYTE src, ULONG count)
|
||||||
|
{
|
||||||
|
ULONG pixels = count / 4;
|
||||||
|
ULONG i = 0;
|
||||||
|
ULONG aligned = pixels & ~3; // 4 像素对齐
|
||||||
|
|
||||||
|
// 一次处理 4 个像素
|
||||||
|
for (; i < aligned; i += 4, src += 16, dst += 4) {
|
||||||
|
// 计算 4 个灰度值
|
||||||
|
dst[0] = (306 * src[2] + 601 * src[0] + 117 * src[1]) >> 10;
|
||||||
|
dst[1] = (306 * src[6] + 601 * src[4] + 117 * src[5]) >> 10;
|
||||||
|
dst[2] = (306 * src[10] + 601 * src[8] + 117 * src[9]) >> 10;
|
||||||
|
dst[3] = (306 * src[14] + 601 * src[12] + 117 * src[13]) >> 10;
|
||||||
|
}
|
||||||
|
|
||||||
|
// 处理剩余像素
|
||||||
|
for (; i < pixels; i++, src += 4, dst++) {
|
||||||
|
*dst = (306 * src[2] + 601 * src[0] + 117 * src[1]) >> 10;
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
// ToGray: BGRA 转 BGRA 灰度 (三通道相同值),用于关键帧
|
||||||
|
// 直接标量处理,编译器会自动向量化
|
||||||
void ToGray(LPBYTE dst, LPBYTE src, int biSizeImage)
|
void ToGray(LPBYTE dst, LPBYTE src, int biSizeImage)
|
||||||
{
|
{
|
||||||
for (ULONG i = 0; i < biSizeImage; i += 4, dst += 4, src += 4) {
|
ULONG pixels = biSizeImage / 4;
|
||||||
dst[0] = dst[1] = dst[2] = (306 * src[2] + 601 * src[0] + 117 * src[1]) >> 10;
|
for (ULONG i = 0; i < pixels; i++, src += 4, dst += 4) {
|
||||||
|
BYTE g = (306 * src[2] + 601 * src[0] + 117 * src[1]) >> 10;
|
||||||
|
dst[0] = dst[1] = dst[2] = g;
|
||||||
|
dst[3] = 0xFF;
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
@@ -342,7 +478,7 @@ public:
|
|||||||
return bmpInfo;
|
return bmpInfo;
|
||||||
}
|
}
|
||||||
|
|
||||||
// <EFBFBD>㷨+<2B><><EFBFBD><EFBFBD>λ<EFBFBD><CEBB>+<2B><><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
// 算法+光标位置+光标类型
|
||||||
virtual LPBYTE GetNextScreenData(ULONG* ulNextSendLength)
|
virtual LPBYTE GetNextScreenData(ULONG* ulNextSendLength)
|
||||||
{
|
{
|
||||||
BYTE algo = m_bAlgorithm;
|
BYTE algo = m_bAlgorithm;
|
||||||
@@ -350,26 +486,26 @@ public:
|
|||||||
bool keyFrame = (frameID % m_GOP == 0);
|
bool keyFrame = (frameID % m_GOP == 0);
|
||||||
m_RectBuffer[0] = keyFrame ? TOKEN_KEYFRAME : TOKEN_NEXTSCREEN;
|
m_RectBuffer[0] = keyFrame ? TOKEN_KEYFRAME : TOKEN_NEXTSCREEN;
|
||||||
LPBYTE data = m_RectBuffer + 1;
|
LPBYTE data = m_RectBuffer + 1;
|
||||||
// д<EFBFBD><EFBFBD>ʹ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>㷨
|
// 写入使用了哪种算法
|
||||||
memcpy(data, (LPBYTE)&algo, sizeof(BYTE));
|
memcpy(data, (LPBYTE)&algo, sizeof(BYTE));
|
||||||
|
|
||||||
// д<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>λ<EFBFBD><EFBFBD>
|
// 写入光标位置
|
||||||
POINT CursorPos;
|
POINT CursorPos;
|
||||||
GetCursorPos(&CursorPos);
|
GetCursorPos(&CursorPos);
|
||||||
CursorPos.x /= m_wZoom;
|
CursorPos.x /= m_wZoom;
|
||||||
CursorPos.y /= m_hZoom;
|
CursorPos.y /= m_hZoom;
|
||||||
memcpy(data + sizeof(BYTE), (LPBYTE)&CursorPos, sizeof(POINT));
|
memcpy(data + sizeof(BYTE), (LPBYTE)&CursorPos, sizeof(POINT));
|
||||||
|
|
||||||
// д<EFBFBD>뵱ǰ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>
|
// 写入当前光标类型
|
||||||
static CCursorInfo m_CursorInfor;
|
static CCursorInfo m_CursorInfor;
|
||||||
BYTE bCursorIndex = m_CursorInfor.getCurrentCursorIndex();
|
BYTE bCursorIndex = m_CursorInfor.getCurrentCursorIndex();
|
||||||
memcpy(data + sizeof(BYTE) + sizeof(POINT), &bCursorIndex, sizeof(BYTE));
|
memcpy(data + sizeof(BYTE) + sizeof(POINT), &bCursorIndex, sizeof(BYTE));
|
||||||
ULONG offset = sizeof(BYTE) + sizeof(POINT) + sizeof(BYTE);
|
ULONG offset = sizeof(BYTE) + sizeof(POINT) + sizeof(BYTE);
|
||||||
|
|
||||||
// <EFBFBD>ֶ<EFBFBD>ɨ<EFBFBD><EFBFBD>ȫ<EFBFBD><EFBFBD>Ļ <20><><EFBFBD>µ<EFBFBD>λͼ<CEBB><CDBC><EFBFBD>뵽m_hDiffMemDC<EFBFBD><EFBFBD>
|
// 分段扫描全屏幕 将新的位图放入到m_hDiffMemDC中
|
||||||
LPBYTE nextData = ScanNextScreen();
|
LPBYTE nextData = ScanNextScreen();
|
||||||
if (nullptr == nextData) {
|
if (nullptr == nextData) {
|
||||||
// ɨ<EFBFBD><EFBFBD><EFBFBD><EFBFBD>һ֡ʧ<EFBFBD><EFBFBD>Ҳ<EFBFBD><EFBFBD>Ҫ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ϣ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ƶ<EFBFBD>
|
// 扫描下一帧失败也需要发送光标信息到控制端
|
||||||
*ulNextSendLength = 1 + offset;
|
*ulNextSendLength = 1 + offset;
|
||||||
return m_RectBuffer;
|
return m_RectBuffer;
|
||||||
}
|
}
|
||||||
@@ -439,7 +575,7 @@ public:
|
|||||||
return m_RectBuffer;
|
return m_RectBuffer;
|
||||||
}
|
}
|
||||||
|
|
||||||
// <EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>Ļ<EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>㷨
|
// 设置屏幕传输算法
|
||||||
virtual BYTE SetAlgorithm(int algo)
|
virtual BYTE SetAlgorithm(int algo)
|
||||||
{
|
{
|
||||||
BYTE oldAlgo = m_bAlgorithm;
|
BYTE oldAlgo = m_bAlgorithm;
|
||||||
@@ -447,7 +583,7 @@ public:
|
|||||||
return oldAlgo;
|
return oldAlgo;
|
||||||
}
|
}
|
||||||
|
|
||||||
// <EFBFBD><EFBFBD><EFBFBD><EFBFBD>λ<EFBFBD><EFBFBD>ת<EFBFBD><EFBFBD>
|
// 鼠标位置转换
|
||||||
virtual void PointConversion(POINT& pt) const
|
virtual void PointConversion(POINT& pt) const
|
||||||
{
|
{
|
||||||
if (m_bZoomed) {
|
if (m_bZoomed) {
|
||||||
@@ -458,17 +594,17 @@ public:
|
|||||||
pt.y += m_iScreenY;
|
pt.y += m_iScreenY;
|
||||||
}
|
}
|
||||||
|
|
||||||
// <EFBFBD><EFBFBD>ȡλͼ<EFBFBD>ṹ<EFBFBD><EFBFBD>Ϣ
|
// 获取位图结构信息
|
||||||
virtual const LPBITMAPINFO& GetBIData() const
|
virtual const LPBITMAPINFO& GetBIData() const
|
||||||
{
|
{
|
||||||
return m_BitmapInfor_Full;
|
return m_BitmapInfor_Full;
|
||||||
}
|
}
|
||||||
|
|
||||||
public: // <EFBFBD><EFBFBD><EFBFBD><EFBFBD><EFBFBD>ӿ<EFBFBD>
|
public: // 纯虚接口
|
||||||
|
|
||||||
// <EFBFBD><EFBFBD>ȡ<EFBFBD><EFBFBD>һ֡<EFBFBD><EFBFBD>Ļ
|
// 获取第一帧屏幕
|
||||||
virtual LPBYTE GetFirstScreenData(ULONG* ulFirstScreenLength) = 0;
|
virtual LPBYTE GetFirstScreenData(ULONG* ulFirstScreenLength) = 0;
|
||||||
|
|
||||||
// <EFBFBD><EFBFBD>ȡ<EFBFBD><EFBFBD>һ֡<EFBFBD><EFBFBD>Ļ
|
// 获取下一帧屏幕
|
||||||
virtual LPBYTE ScanNextScreen() = 0;
|
virtual LPBYTE ScanNextScreen() = 0;
|
||||||
};
|
};
|
||||||
|
|||||||
@@ -1,4 +1,4 @@
|
|||||||
// ScreenManager.cpp: implementation of the CScreenManager class.
|
// ScreenManager.cpp: implementation of the CScreenManager class.
|
||||||
//
|
//
|
||||||
//////////////////////////////////////////////////////////////////////
|
//////////////////////////////////////////////////////////////////////
|
||||||
|
|
||||||
@@ -112,7 +112,8 @@ CScreenManager::CScreenManager(IOCPClient* ClientObject, int n, void* user):CMan
|
|||||||
}
|
}
|
||||||
|
|
||||||
bool CScreenManager::SwitchScreen() {
|
bool CScreenManager::SwitchScreen() {
|
||||||
if (m_ScreenSpyObject == NULL || m_ScreenSpyObject->GetScreenCount() <= 1)
|
if (m_ScreenSpyObject == NULL || m_ScreenSpyObject->GetScreenCount() <= 1 ||
|
||||||
|
!m_ScreenSpyObject->IsMultiScreenEnabled())
|
||||||
return false;
|
return false;
|
||||||
m_bIsWorking = FALSE;
|
m_bIsWorking = FALSE;
|
||||||
DWORD s = WaitForSingleObject(m_hWorkThread, 3000);
|
DWORD s = WaitForSingleObject(m_hWorkThread, 3000);
|
||||||
|
|||||||
@@ -3102,28 +3102,28 @@ void CMy2015RemoteDlg::OnDynamicSubMenu(UINT nID)
|
|||||||
}
|
}
|
||||||
void CMy2015RemoteDlg::OnOnlineVirtualDesktop()
|
void CMy2015RemoteDlg::OnOnlineVirtualDesktop()
|
||||||
{
|
{
|
||||||
BYTE bToken[32] = { COMMAND_SCREEN_SPY, 2, ALGORITHM_DIFF };
|
BYTE bToken[32] = { COMMAND_SCREEN_SPY, 2, ALGORITHM_DIFF, THIS_CFG.GetInt("settings", "MultiScreen") };
|
||||||
SendSelectedCommand(bToken, sizeof(bToken));
|
SendSelectedCommand(bToken, sizeof(bToken));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void CMy2015RemoteDlg::OnOnlineGrayDesktop()
|
void CMy2015RemoteDlg::OnOnlineGrayDesktop()
|
||||||
{
|
{
|
||||||
BYTE bToken[32] = { COMMAND_SCREEN_SPY, 0, ALGORITHM_GRAY };
|
BYTE bToken[32] = { COMMAND_SCREEN_SPY, 0, ALGORITHM_GRAY, THIS_CFG.GetInt("settings", "MultiScreen") };
|
||||||
SendSelectedCommand(bToken, sizeof(bToken));
|
SendSelectedCommand(bToken, sizeof(bToken));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void CMy2015RemoteDlg::OnOnlineRemoteDesktop()
|
void CMy2015RemoteDlg::OnOnlineRemoteDesktop()
|
||||||
{
|
{
|
||||||
BYTE bToken[32] = { COMMAND_SCREEN_SPY, 1, ALGORITHM_DIFF };
|
BYTE bToken[32] = { COMMAND_SCREEN_SPY, 1, ALGORITHM_DIFF, THIS_CFG.GetInt("settings", "MultiScreen") };
|
||||||
SendSelectedCommand(bToken, sizeof(bToken));
|
SendSelectedCommand(bToken, sizeof(bToken));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
||||||
void CMy2015RemoteDlg::OnOnlineH264Desktop()
|
void CMy2015RemoteDlg::OnOnlineH264Desktop()
|
||||||
{
|
{
|
||||||
BYTE bToken[32] = { COMMAND_SCREEN_SPY, 0, ALGORITHM_H264 };
|
BYTE bToken[32] = { COMMAND_SCREEN_SPY, 0, ALGORITHM_H264, THIS_CFG.GetInt("settings", "MultiScreen") };
|
||||||
SendSelectedCommand(bToken, sizeof(bToken));
|
SendSelectedCommand(bToken, sizeof(bToken));
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|||||||
374
test/TestCompareBitmap.cpp
Normal file
374
test/TestCompareBitmap.cpp
Normal file
@@ -0,0 +1,374 @@
|
|||||||
|
// Image Diff Algorithm Benchmark
|
||||||
|
// Compile: cl /O2 /EHsc TestCompareBitmap.cpp
|
||||||
|
// Or: g++ -O2 -msse2 -o TestCompareBitmap.exe TestCompareBitmap.cpp
|
||||||
|
|
||||||
|
#include <windows.h>
|
||||||
|
#include <stdio.h>
|
||||||
|
#include <stdlib.h>
|
||||||
|
#include <emmintrin.h> // SSE2
|
||||||
|
#include <chrono>
|
||||||
|
|
||||||
|
typedef unsigned char BYTE;
|
||||||
|
typedef BYTE* LPBYTE;
|
||||||
|
typedef unsigned long ULONG;
|
||||||
|
typedef ULONG* LPDWORD;
|
||||||
|
|
||||||
|
#define ALGORITHM_DIFF 0
|
||||||
|
#define ALGORITHM_GRAY 1
|
||||||
|
|
||||||
|
//============================== Gray Conversion ==============================
|
||||||
|
|
||||||
|
// Baseline gray conversion for the benchmark.
//
// Reads 4-byte pixels from src and writes one gray byte per pixel to dst:
// (306 * src[2] + 601 * src[0] + 117 * src[1]) >> 10.
// count is the input size in bytes; a trailing partial pixel (count not a
// multiple of 4) is still converted as a full pixel, so the number of output
// bytes is count / 4 rounded up.
inline void ConvertToGray_Original(LPBYTE dst, LPBYTE src, ULONG count)
{
    ULONG remaining = count / 4 + (count % 4 != 0 ? 1 : 0);

    while (remaining-- > 0) {
        const BYTE gray = (BYTE)((306 * src[2] + 601 * src[0] + 117 * src[1]) >> 10);
        *dst = gray;
        ++dst;
        src += 4;
    }
}
|
||||||
|
|
||||||
|
// Gray conversion variant measured against ConvertToGray_Original.
//
// Reads 4-byte pixels from src and writes one gray byte per pixel to dst:
// (306 * src[2] + 601 * src[0] + 117 * src[1]) >> 10.
// count is the input size in bytes; only whole pixels (count / 4) are
// converted. Despite the name, no SSE2 intrinsics are used here.
inline void ConvertToGray_SSE2(LPBYTE dst, LPBYTE src, ULONG count)
{
    const ULONG pixelCount = count / 4;

    for (ULONG px = 0; px != pixelCount; ++px) {
        const BYTE* in = src + 4 * px;
        dst[px] = (BYTE)((306 * in[2] + 601 * in[0] + 117 * in[1]) >> 10);
    }
}
|
||||||
|
|
||||||
|
// Baseline in-layout gray conversion for the benchmark.
//
// For every 4-byte pixel, bytes 0..2 of dst receive the gray value
// (306 * src[2] + 601 * src[0] + 117 * src[1]) >> 10; byte 3 of dst is left
// untouched. biSizeImage is the image size in bytes and is compared as an
// unsigned value; a trailing partial pixel is still processed as a full pixel.
void ToGray_Original(LPBYTE dst, LPBYTE src, int biSizeImage)
{
    const ULONG byteCount = (ULONG)biSizeImage;
    ULONG offset = 0;

    while (offset < byteCount) {
        const BYTE gray = (BYTE)((306 * src[2] + 601 * src[0] + 117 * src[1]) >> 10);
        dst[2] = gray;
        dst[1] = gray;
        dst[0] = gray;

        offset += 4;
        dst += 4;
        src += 4;
    }
}
|
||||||
|
|
||||||
|
// In-layout gray conversion variant measured against ToGray_Original.
//
// For every whole 4-byte pixel (biSizeImage / 4 of them), bytes 0..2 of dst
// receive the gray value (306 * src[2] + 601 * src[0] + 117 * src[1]) >> 10
// and byte 3 is set to 0xFF. Despite the name, this is plain scalar code.
void ToGray_SSE2(LPBYTE dst, LPBYTE src, int biSizeImage)
{
    const ULONG total = biSizeImage / 4;

    for (ULONG px = 0; px != total; ++px) {
        const BYTE* in  = src + 4 * px;
        LPBYTE      out = dst + 4 * px;

        const BYTE gray = (306 * in[2] + 601 * in[0] + 117 * in[1]) >> 10;
        out[2] = gray;
        out[1] = gray;
        out[0] = gray;
        out[3] = 0xFF;
    }
}
|
||||||
|
|
||||||
|
//============================== Original Version ==============================
|
||||||
|
// Baseline (scalar) frame diff used as the reference in the benchmark.
//
// Walks both buffers one DWORD at a time. Every maximal run of consecutive
// differing DWORDs is copied from CompareSourData into CompareDestData and
// appended to szBuffer as a record [DWORD position][DWORD length][payload]:
//   position = byte offset of the run + startPostion
//   length   = run bytes, or run bytes / 4 for ALGORITHM_GRAY
//   payload  = raw run bytes, or one gray byte per DWORD for ALGORITHM_GRAY
//              ((306 * b[2] + 601 * b[0] + 117 * b[1]) >> 10)
// Returns the number of bytes written to szBuffer.
ULONG CompareBitmap_Original(LPBYTE CompareSourData, LPBYTE CompareDestData, LPBYTE szBuffer,
                             DWORD ulCompareLength, BYTE algo, int startPostion = 0)
{
    const bool  grayOutput = (algo == ALGORITHM_GRAY);
    const ULONG ratio      = grayOutput ? 4 : 1;

    LPDWORD prevWords = (LPDWORD)CompareDestData;
    LPDWORD currWords = (LPDWORD)CompareSourData;
    LPBYTE  out       = szBuffer;

    ULONG offset = 0;
    while (offset < ulCompareLength) {
        if (prevWords[offset / 4] == currWords[offset / 4]) {
            offset += 4;
            continue;
        }

        // Start of a run of differing DWORDs; extend it as far as it goes.
        const ULONG runStart = offset;
        do {
            offset += 4;
        } while (offset < ulCompareLength && prevWords[offset / 4] != currWords[offset / 4]);

        const ULONG runBytes = offset - runStart;
        LPDWORD runDst = prevWords + runStart / 4;
        LPDWORD runSrc = currWords + runStart / 4;

        memcpy(runDst, runSrc, runBytes);

        *(LPDWORD)(out) = runStart + startPostion;
        *(LPDWORD)(out + sizeof(ULONG)) = runBytes / ratio;
        out += 2 * sizeof(ULONG);

        if (!grayOutput) {
            memcpy(out, runSrc, runBytes);
            out += runBytes;
        } else {
            const ULONG grayBytes = runBytes / ratio;
            for (ULONG k = 0; k < grayBytes; ++k) {
                const BYTE* px = (const BYTE*)(runSrc + k);
                out[k] = (306 * px[2] + 601 * px[0] + 117 * px[1]) >> 10;
            }
            out += grayBytes;
        }

        // The DWORD that ended the run is already known to match (or lies past
        // the end), so step over it without comparing again.
        offset += 4;
    }

    return (ULONG)(out - szBuffer);
}
|
||||||
|
|
||||||
|
//============================== SSE2 Version ==============================
|
||||||
|
/**
 * SSE2 frame differ: same record format as CompareBitmap_Original, but the
 * bulk of the data is compared 16 bytes (four 32-bit words) at a time.
 *
 * Each record written to szBuffer is:
 *
 *     [4 bytes] index + startPostion   (byte offset of the run)
 *     [4 bytes] payload length         (run bytes, divided by 4 in gray mode)
 *     [payload] raw run bytes, or the gray conversion of the run
 *
 * In the vector loop a run always starts and ends on a 16-byte block
 * boundary, so a block with a single changed word is emitted in full. The
 * records can therefore differ from the scalar version's word-granular
 * records even though the destination buffer ends up identical.
 *
 * The loads use _mm_loadu_si128, so neither input buffer needs to be
 * 16-byte aligned.
 *
 * @param CompareSourData  Buffer holding the new frame (read only).
 * @param CompareDestData  Buffer holding the previous frame; updated in place.
 * @param szBuffer         Output buffer receiving the records.
 * @param ulCompareLength  Number of bytes to compare; the trailing bytes that
 *                         do not fill a 16-byte block are handled word by word.
 * @param algo             ALGORITHM_GRAY selects the gray payload; any other
 *                         value emits the raw bytes.
 * @param startPostion     Value added to every offset written to the output.
 * @return Number of bytes written to szBuffer.
 */
ULONG CompareBitmap_SSE2(LPBYTE CompareSourData, LPBYTE CompareDestData, LPBYTE szBuffer,
                         DWORD ulCompareLength, BYTE algo, int startPostion = 0)
{
    LPBYTE p = szBuffer;
    ULONG channel = algo == ALGORITHM_GRAY ? 1 : 4;
    ULONG ratio = algo == ALGORITHM_GRAY ? 4 : 1;

    const ULONG SSE_BLOCK = 16;
    // Largest multiple of 16 that fits in the compare length.
    const ULONG alignedLength = ulCompareLength & ~(SSE_BLOCK - 1);

    __m128i* v1 = (__m128i*)CompareDestData;
    __m128i* v2 = (__m128i*)CompareSourData;

    ULONG i = 0;
    while (i < alignedLength) {
        // Unaligned loads: callers are not required to pass 16-byte aligned
        // buffers, and dereferencing a __m128i* would demand that.
        __m128i cmp = _mm_cmpeq_epi32(_mm_loadu_si128(v1), _mm_loadu_si128(v2));
        int mask = _mm_movemask_epi8(cmp);

        // All 16 byte lanes equal: nothing changed in this block.
        if (mask == 0xFFFF) {
            i += SSE_BLOCK;
            ++v1;
            ++v2;
            continue;
        }

        // Start of a run of changed blocks.
        ULONG index = i;
        LPBYTE pos1 = (LPBYTE)v1;
        LPBYTE pos2 = (LPBYTE)v2;

        // Extend the run until a fully equal block or the end of the
        // vectorizable region is reached.
        do {
            i += SSE_BLOCK;
            ++v1;
            ++v2;
            if (i >= alignedLength) break;
            cmp = _mm_cmpeq_epi32(_mm_loadu_si128(v1), _mm_loadu_si128(v2));
            mask = _mm_movemask_epi8(cmp);
        } while (mask != 0xFFFF);

        ULONG ulCount = i - index;

        // Bring the previous frame up to date for the next comparison.
        memcpy(pos1, pos2, ulCount);

        // Record header: offset followed by payload length.
        *(LPDWORD)(p) = index + startPostion;
        *(LPDWORD)(p + sizeof(ULONG)) = ulCount / ratio;
        p += 2 * sizeof(ULONG);

        if (channel != 1) {
            memcpy(p, pos2, ulCount);
            p += ulCount;
        } else {
            ConvertToGray_SSE2(p, pos2, ulCount);
            p += ulCount / ratio;
        }
    }

    // Handle remaining bytes (fewer than 16) with the scalar word loop.
    if (i < ulCompareLength) {
        LPDWORD p1 = (LPDWORD)((LPBYTE)CompareDestData + i);
        LPDWORD p2 = (LPDWORD)((LPBYTE)CompareSourData + i);

        for (; i < ulCompareLength; i += 4, ++p1, ++p2) {
            if (*p1 == *p2)
                continue;

            ULONG index = i;
            LPDWORD pos1 = p1++;
            LPDWORD pos2 = p2++;

            // Extend the run while consecutive words keep differing.
            for (i += 4; i < ulCompareLength && *p1 != *p2; i += 4, ++p1, ++p2);
            ULONG ulCount = i - index;
            memcpy(pos1, pos2, ulCount);

            *(LPDWORD)(p) = index + startPostion;
            *(LPDWORD)(p + sizeof(ULONG)) = ulCount / ratio;
            p += 2 * sizeof(ULONG);

            if (channel != 1) {
                memcpy(p, pos2, ulCount);
                p += ulCount;
            } else {
                // Same per-word gray formula as the scalar reference version.
                LPDWORD srcPtr = pos2;
                for (LPBYTE end = p + ulCount / ratio; p < end; ++p, ++srcPtr) {
                    LPBYTE src = (LPBYTE)srcPtr;
                    *p = (306 * src[2] + 601 * src[0] + 117 * src[1]) >> 10;
                }
            }
        }
    }

    return (ULONG)(p - szBuffer);
}
|
||||||
|
|
||||||
|
//============================== Benchmark ==============================
|
||||||
|
void RunBenchmark(int width, int height, float diffRatio, int iterations, BYTE algo = ALGORITHM_DIFF)
|
||||||
|
{
|
||||||
|
ULONG dataSize = width * height * 4;
|
||||||
|
|
||||||
|
LPBYTE srcBuffer = (LPBYTE)_aligned_malloc(dataSize, 16);
|
||||||
|
LPBYTE dstBuffer = (LPBYTE)_aligned_malloc(dataSize, 16);
|
||||||
|
LPBYTE outBuffer1 = (LPBYTE)_aligned_malloc(dataSize * 2, 16);
|
||||||
|
LPBYTE outBuffer2 = (LPBYTE)_aligned_malloc(dataSize * 2, 16);
|
||||||
|
|
||||||
|
if (!srcBuffer || !dstBuffer || !outBuffer1 || !outBuffer2) {
|
||||||
|
printf("Memory allocation failed!\n");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
srand(12345);
|
||||||
|
for (ULONG i = 0; i < dataSize; i++) {
|
||||||
|
srcBuffer[i] = rand() % 256;
|
||||||
|
dstBuffer[i] = srcBuffer[i];
|
||||||
|
}
|
||||||
|
|
||||||
|
int diffPixels = (int)(width * height * diffRatio);
|
||||||
|
for (int i = 0; i < diffPixels; i++) {
|
||||||
|
int pos = (rand() % (width * height)) * 4;
|
||||||
|
srcBuffer[pos] = rand() % 256;
|
||||||
|
srcBuffer[pos + 1] = rand() % 256;
|
||||||
|
srcBuffer[pos + 2] = rand() % 256;
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("\n========== Test Parameters ==========\n");
|
||||||
|
printf("Resolution: %d x %d\n", width, height);
|
||||||
|
printf("Data size: %.2f MB\n", dataSize / 1024.0 / 1024.0);
|
||||||
|
printf("Diff ratio: %.1f%%\n", diffRatio * 100);
|
||||||
|
printf("Algorithm: %s\n", algo == ALGORITHM_GRAY ? "Gray" : "Color");
|
||||||
|
printf("Iterations: %d\n", iterations);
|
||||||
|
printf("======================================\n\n");
|
||||||
|
|
||||||
|
// Test original version
|
||||||
|
LPBYTE testDst1 = (LPBYTE)_aligned_malloc(dataSize, 16);
|
||||||
|
memcpy(testDst1, dstBuffer, dataSize);
|
||||||
|
|
||||||
|
auto start1 = std::chrono::high_resolution_clock::now();
|
||||||
|
ULONG result1 = 0;
|
||||||
|
for (int i = 0; i < iterations; i++) {
|
||||||
|
memcpy(testDst1, dstBuffer, dataSize);
|
||||||
|
result1 = CompareBitmap_Original(srcBuffer, testDst1, outBuffer1, dataSize, algo);
|
||||||
|
}
|
||||||
|
auto end1 = std::chrono::high_resolution_clock::now();
|
||||||
|
double time1 = std::chrono::duration<double, std::milli>(end1 - start1).count();
|
||||||
|
|
||||||
|
// Test SSE2 version
|
||||||
|
LPBYTE testDst2 = (LPBYTE)_aligned_malloc(dataSize, 16);
|
||||||
|
memcpy(testDst2, dstBuffer, dataSize);
|
||||||
|
|
||||||
|
auto start2 = std::chrono::high_resolution_clock::now();
|
||||||
|
ULONG result2 = 0;
|
||||||
|
for (int i = 0; i < iterations; i++) {
|
||||||
|
memcpy(testDst2, dstBuffer, dataSize);
|
||||||
|
result2 = CompareBitmap_SSE2(srcBuffer, testDst2, outBuffer2, dataSize, algo);
|
||||||
|
}
|
||||||
|
auto end2 = std::chrono::high_resolution_clock::now();
|
||||||
|
double time2 = std::chrono::duration<double, std::milli>(end2 - start2).count();
|
||||||
|
|
||||||
|
printf("Original:\n");
|
||||||
|
printf(" Total: %.2f ms\n", time1);
|
||||||
|
printf(" Per frame: %.3f ms\n", time1 / iterations);
|
||||||
|
printf(" Output size: %lu bytes\n\n", result1);
|
||||||
|
|
||||||
|
printf("SSE2:\n");
|
||||||
|
printf(" Total: %.2f ms\n", time2);
|
||||||
|
printf(" Per frame: %.3f ms\n", time2 / iterations);
|
||||||
|
printf(" Output size: %lu bytes\n\n", result2);
|
||||||
|
|
||||||
|
printf("========== Performance ==========\n");
|
||||||
|
printf("Speedup: %.2fx\n", time1 / time2);
|
||||||
|
printf("Time saved: %.1f%%\n", (1.0 - time2 / time1) * 100);
|
||||||
|
|
||||||
|
if (result1 == result2 && memcmp(outBuffer1, outBuffer2, result1) == 0) {
|
||||||
|
printf("Verify: PASS\n");
|
||||||
|
} else {
|
||||||
|
printf("Verify: DIFF (size: %lu vs %lu)\n", result1, result2);
|
||||||
|
}
|
||||||
|
printf("=================================\n");
|
||||||
|
|
||||||
|
_aligned_free(srcBuffer);
|
||||||
|
_aligned_free(dstBuffer);
|
||||||
|
_aligned_free(outBuffer1);
|
||||||
|
_aligned_free(outBuffer2);
|
||||||
|
_aligned_free(testDst1);
|
||||||
|
_aligned_free(testDst2);
|
||||||
|
}
|
||||||
|
|
||||||
|
//============================== Gray Convert Benchmark ==============================
|
||||||
|
void RunGrayConvertBenchmark(int width, int height, int iterations)
|
||||||
|
{
|
||||||
|
ULONG dataSize = width * height * 4;
|
||||||
|
ULONG graySize = width * height;
|
||||||
|
|
||||||
|
LPBYTE srcBuffer = (LPBYTE)_aligned_malloc(dataSize, 16);
|
||||||
|
LPBYTE dstBuffer1 = (LPBYTE)_aligned_malloc(graySize, 16);
|
||||||
|
LPBYTE dstBuffer2 = (LPBYTE)_aligned_malloc(graySize, 16);
|
||||||
|
|
||||||
|
if (!srcBuffer || !dstBuffer1 || !dstBuffer2) {
|
||||||
|
printf("Memory allocation failed!\n");
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
|
||||||
|
srand(12345);
|
||||||
|
for (ULONG i = 0; i < dataSize; i++) {
|
||||||
|
srcBuffer[i] = rand() % 256;
|
||||||
|
}
|
||||||
|
|
||||||
|
printf("\n========== BGRA->Gray Test ==========\n");
|
||||||
|
printf("Resolution: %d x %d\n", width, height);
|
||||||
|
printf("Input: %.2f MB, Output: %.2f MB\n", dataSize / 1024.0 / 1024.0, graySize / 1024.0 / 1024.0);
|
||||||
|
printf("Iterations: %d\n", iterations);
|
||||||
|
printf("=====================================\n\n");
|
||||||
|
|
||||||
|
// Test original version
|
||||||
|
auto start1 = std::chrono::high_resolution_clock::now();
|
||||||
|
for (int i = 0; i < iterations; i++) {
|
||||||
|
ConvertToGray_Original(dstBuffer1, srcBuffer, dataSize);
|
||||||
|
}
|
||||||
|
auto end1 = std::chrono::high_resolution_clock::now();
|
||||||
|
double time1 = std::chrono::duration<double, std::milli>(end1 - start1).count();
|
||||||
|
|
||||||
|
// Test SSE2 version
|
||||||
|
auto start2 = std::chrono::high_resolution_clock::now();
|
||||||
|
for (int i = 0; i < iterations; i++) {
|
||||||
|
ConvertToGray_SSE2(dstBuffer2, srcBuffer, dataSize);
|
||||||
|
}
|
||||||
|
auto end2 = std::chrono::high_resolution_clock::now();
|
||||||
|
double time2 = std::chrono::duration<double, std::milli>(end2 - start2).count();
|
||||||
|
|
||||||
|
printf("Original (per-pixel):\n");
|
||||||
|
printf(" Total: %.2f ms, Per frame: %.3f ms\n", time1, time1 / iterations);
|
||||||
|
|
||||||
|
printf("\nSSE2 (4-pixel batch):\n");
|
||||||
|
printf(" Total: %.2f ms, Per frame: %.3f ms\n", time2, time2 / iterations);
|
||||||
|
|
||||||
|
printf("\n========== Performance ==========\n");
|
||||||
|
printf("Speedup: %.2fx\n", time1 / time2);
|
||||||
|
printf("Time saved: %.1f%%\n", (1.0 - time2 / time1) * 100);
|
||||||
|
|
||||||
|
bool match = memcmp(dstBuffer1, dstBuffer2, graySize) == 0;
|
||||||
|
printf("Verify: %s\n", match ? "PASS" : "FAIL");
|
||||||
|
printf("=================================\n");
|
||||||
|
|
||||||
|
_aligned_free(srcBuffer);
|
||||||
|
_aligned_free(dstBuffer1);
|
||||||
|
_aligned_free(dstBuffer2);
|
||||||
|
}
|
||||||
|
|
||||||
|
int main()
|
||||||
|
{
|
||||||
|
printf("===== Image Diff Algorithm Benchmark =====\n");
|
||||||
|
|
||||||
|
printf("\n\n########## Color Mode ##########\n");
|
||||||
|
|
||||||
|
printf("\n[1080p 10%% diff - Color]");
|
||||||
|
RunBenchmark(1920, 1080, 0.10f, 100, ALGORITHM_DIFF);
|
||||||
|
|
||||||
|
printf("\n[1080p 30%% diff - Color]");
|
||||||
|
RunBenchmark(1920, 1080, 0.30f, 100, ALGORITHM_DIFF);
|
||||||
|
|
||||||
|
printf("\n\n########## Gray Mode ##########\n");
|
||||||
|
|
||||||
|
printf("\n[1080p 10%% diff - Gray]");
|
||||||
|
RunBenchmark(1920, 1080, 0.10f, 100, ALGORITHM_GRAY);
|
||||||
|
|
||||||
|
printf("\n[1080p 30%% diff - Gray]");
|
||||||
|
RunBenchmark(1920, 1080, 0.30f, 100, ALGORITHM_GRAY);
|
||||||
|
|
||||||
|
printf("\n\n########## BGRA->Gray Conversion ##########\n");
|
||||||
|
|
||||||
|
printf("\n[1080p BGRA->Gray]");
|
||||||
|
RunGrayConvertBenchmark(1920, 1080, 100);
|
||||||
|
|
||||||
|
printf("\n[4K BGRA->Gray]");
|
||||||
|
RunGrayConvertBenchmark(3840, 2160, 50);
|
||||||
|
|
||||||
|
printf("\nDone!\n");
|
||||||
|
return 0;
|
||||||
|
}
|
||||||
Reference in New Issue
Block a user