diff --git a/benchmark/benchncnn.cpp b/benchmark/benchncnn.cpp index 1d4ae1d755e..714dca3180f 100644 --- a/benchmark/benchncnn.cpp +++ b/benchmark/benchncnn.cpp @@ -161,8 +161,8 @@ void benchmark(const char* comment, const ncnn::Mat& _in, const ncnn::Option& op int main(int argc, char** argv) { int loop_count = 4; - int num_threads = ncnn::get_cpu_count(); - int powersave = 0; + int num_threads = ncnn::get_physical_big_cpu_count(); + int powersave = 2; int gpu_device = -1; int cooling_down = 1; diff --git a/src/cpu.cpp b/src/cpu.cpp index c84c6491157..eb012c31587 100644 --- a/src/cpu.cpp +++ b/src/cpu.cpp @@ -42,6 +42,12 @@ #include #endif +#if defined _WIN32 && !(defined __MINGW32__) +#define WIN32_LEAN_AND_MEAN +#include +#include +#endif + #if defined __ANDROID__ || defined __linux__ #if defined __ANDROID__ #if __ANDROID_API__ >= 18 @@ -296,7 +302,44 @@ static int g_hw_optional_arm_FEAT_BF16 = get_hw_capability("hw.optional.arm.FEAT static int g_hw_optional_arm_FEAT_I8MM = get_hw_capability("hw.optional.arm.FEAT_I8MM"); #endif // __APPLE__ -#if defined __ANDROID__ || defined __linux__ +#if (defined _WIN32 && !(defined __MINGW32__)) +CpuSet::CpuSet() +{ + disable_all(); +} + +void CpuSet::enable(int cpu) +{ + mask |= (1 << cpu); +} + +void CpuSet::disable(int cpu) +{ + mask &= ~(1 << cpu); +} + +void CpuSet::disable_all() +{ + mask = 0; +} + +bool CpuSet::is_enabled(int cpu) const +{ + return mask & (1 << cpu); +} + +int CpuSet::num_enabled() const +{ + int num_enabled = 0; + for (int i = 0; i < (int)sizeof(mask) * 8; i++) + { + if (is_enabled(i)) + num_enabled++; + } + + return num_enabled; +} +#elif defined __ANDROID__ || defined __linux__ CpuSet::CpuSet() { disable_all(); @@ -1109,6 +1152,10 @@ static int get_cpucount() count = emscripten_num_logical_cores(); else count = 1; +#elif (defined _WIN32 && !(defined __MINGW32__)) + SYSTEM_INFO system_info; + GetSystemInfo(&system_info); + count = system_info.dwNumberOfProcessors; #elif defined __ANDROID__ || defined __linux__ // get cpu count from /proc/cpuinfo FILE* fp = fopen("/proc/cpuinfo", "rb"); @@ -1164,6 +1211,220 @@ int get_big_cpu_count() return big_cpu_count ? big_cpu_count : g_cpucount; } +#if defined __ANDROID__ || defined __linux__ +static int get_thread_siblings(int cpuid) +{ + char path[256]; + sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings", cpuid); + + FILE* fp = fopen(path, "rb"); + if (!fp) + return -1; + + int thread_siblings = -1; + int nscan = fscanf(fp, "%x", &thread_siblings); + if (nscan != 1) + { + // ignore + } + + fclose(fp); + + return thread_siblings; +} +#endif // defined __ANDROID__ || defined __linux__ + +static int get_physical_cpucount() +{ + int count = 0; +#if (defined _WIN32 && !(defined __MINGW32__)) + typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); + LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); + if (glpi == NULL) + { + NCNN_LOGE("GetLogicalProcessorInformation is not supported"); + return g_cpucount; + } + + DWORD return_length = 0; + glpi(NULL, &return_length); + + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length); + glpi(buffer, &return_length); + + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer; + DWORD byte_offset = 0; + while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length) + { + if (ptr->Relationship == RelationProcessorCore) + { + count++; + } + + byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + ptr++; + } + + free(buffer); +#elif defined __ANDROID__ || defined __linux__ + std::vector thread_set; + for (int i = 0; i < g_cpucount; i++) + { + int thread_siblings = get_thread_siblings(i); + if (thread_siblings == -1) + { + // ignore malformed one + continue; + } + + bool thread_siblings_exists = false; + for (size_t j = 0; j < thread_set.size(); j++) + { + if (thread_set[j] == thread_siblings) + { + thread_siblings_exists = true; + break; + } + } + + if (!thread_siblings_exists) + { + thread_set.push_back(thread_siblings); + count++; + } + } +#elif __APPLE__ + size_t len = sizeof(count); + sysctlbyname("hw.physicalcpu_max", &count, &len, NULL, 0); +#else + count = g_cpucount; +#endif + + if (count > g_cpucount) + count = g_cpucount; + + return count; +} + +static int g_physical_cpucount = get_physical_cpucount(); + +int get_physical_cpu_count() +{ + return g_physical_cpucount; +} + +int get_physical_little_cpu_count() +{ + if (g_physical_cpucount == g_cpucount) + return get_little_cpu_count(); + + return g_physical_cpucount * 2 - g_cpucount; +} + +int get_physical_big_cpu_count() +{ + if (g_physical_cpucount == g_cpucount) + return get_big_cpu_count(); + + return g_cpucount - g_physical_cpucount; +} + +#if (defined _WIN32 && !(defined __MINGW32__)) +static CpuSet get_smt_cpu_mask() +{ + CpuSet smt_cpu_mask; + + typedef BOOL(WINAPI * LPFN_GLPI)(PSYSTEM_LOGICAL_PROCESSOR_INFORMATION, PDWORD); + LPFN_GLPI glpi = (LPFN_GLPI)GetProcAddress(GetModuleHandle(TEXT("kernel32")), "GetLogicalProcessorInformation"); + if (glpi == NULL) + { + NCNN_LOGE("GetLogicalProcessorInformation is not supported"); + return smt_cpu_mask; + } + + DWORD return_length = 0; + glpi(NULL, &return_length); + + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION buffer = (PSYSTEM_LOGICAL_PROCESSOR_INFORMATION)malloc(return_length); + glpi(buffer, &return_length); + + PSYSTEM_LOGICAL_PROCESSOR_INFORMATION ptr = buffer; + DWORD byte_offset = 0; + while (byte_offset + sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION) <= return_length) + { + if (ptr->Relationship == RelationProcessorCore) + { + CpuSet smt_set; + smt_set.mask = ptr->ProcessorMask; + if (smt_set.num_enabled() > 1) + { + // this core is smt + smt_cpu_mask.mask |= smt_set.mask; + } + } + + byte_offset += sizeof(SYSTEM_LOGICAL_PROCESSOR_INFORMATION); + ptr++; + } + + free(buffer); + + return smt_cpu_mask; +} + +static std::vector get_max_freq_mhz() +{ + typedef struct _PROCESSOR_POWER_INFORMATION + { + ULONG Number; + ULONG MaxMhz; + ULONG CurrentMhz; + ULONG MhzLimit; + ULONG MaxIdleState; + ULONG CurrentIdleState; + } PROCESSOR_POWER_INFORMATION, *PPROCESSOR_POWER_INFORMATION; + + HMODULE powrprof = LoadLibrary(TEXT("powrprof.dll")); + + typedef LONG(WINAPI * LPFN_CNPI)(POWER_INFORMATION_LEVEL, PVOID, ULONG, PVOID, ULONG); + LPFN_CNPI cnpi = (LPFN_CNPI)GetProcAddress(powrprof, "CallNtPowerInformation"); + if (cnpi == NULL) + { + NCNN_LOGE("CallNtPowerInformation is not supported"); + FreeLibrary(powrprof); + return std::vector(g_cpucount, 0); + } + + DWORD return_length = sizeof(PROCESSOR_POWER_INFORMATION) * g_cpucount; + PPROCESSOR_POWER_INFORMATION buffer = (PPROCESSOR_POWER_INFORMATION)malloc(return_length); + + cnpi(ProcessorInformation, NULL, 0, buffer, return_length); + + std::vector ret; + for (int i = 0; i < g_cpucount; i++) + { + ULONG max_mhz = buffer[i].MaxMhz; + ret.push_back(max_mhz); + } + + free(buffer); + FreeLibrary(powrprof); + return ret; +} + +static int set_sched_affinity(const CpuSet& thread_affinity_mask) +{ + DWORD_PTR prev_mask = SetThreadAffinityMask(GetCurrentThread(), thread_affinity_mask.mask); + if (prev_mask == 0) + { + NCNN_LOGE("SetThreadAffinityMask failed %d", GetLastError()); + return -1; + } + + return 0; +} +#endif // (defined _WIN32 && !(defined __MINGW32__)) + #if defined __ANDROID__ || defined __linux__ static int get_max_freq_khz(int cpuid) { @@ -1239,6 +1500,39 @@ static int get_max_freq_khz(int cpuid) return max_freq_khz; } +static bool is_smt_cpu(int cpuid) +{ + // https://github.com/torvalds/linux/blob/v6.0/Documentation/ABI/stable/sysfs-devices-system-cpu#L68-72 + char path[256]; + sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/core_cpus_list", cpuid); + + FILE* fp = fopen(path, "rb"); + + if (!fp) + { + sprintf(path, "/sys/devices/system/cpu/cpu%d/topology/thread_siblings_list", cpuid); + fp = fopen(path, "rb"); + + if (!fp) + return false; + } + + bool is_smt = false; + while (!feof(fp)) + { + char ch = fgetc(fp); + if (ch == ',' || ch == '-') + { + is_smt = true; + break; + } + } + + fclose(fp); + + return is_smt; +} + static int set_sched_affinity(const CpuSet& thread_affinity_mask) { // set affinity for thread @@ -1329,7 +1623,48 @@ static int setup_thread_affinity_masks() { g_thread_affinity_mask_all.disable_all(); -#if defined __ANDROID__ || defined __linux__ +#if (defined _WIN32 && !(defined __MINGW32__)) + // get max freq mhz for all cores + int max_freq_mhz_min = INT_MAX; + int max_freq_mhz_max = 0; + std::vector cpu_max_freq_mhz = get_max_freq_mhz(); + for (int i = 0; i < g_cpucount; i++) + { + int max_freq_mhz = cpu_max_freq_mhz[i]; + + // NCNN_LOGE("%d max freq = %d khz", i, max_freq_mhz); + + if (max_freq_mhz > max_freq_mhz_max) + max_freq_mhz_max = max_freq_mhz; + if (max_freq_mhz < max_freq_mhz_min) + max_freq_mhz_min = max_freq_mhz; + } + + int max_freq_mhz_medium = (max_freq_mhz_min + max_freq_mhz_max) / 2; + if (max_freq_mhz_medium == max_freq_mhz_max) + { + g_thread_affinity_mask_little.disable_all(); + g_thread_affinity_mask_big = g_thread_affinity_mask_all; + return 0; + } + + CpuSet smt_cpu_mask = get_smt_cpu_mask(); + + for (int i = 0; i < g_cpucount; i++) + { + if (smt_cpu_mask.is_enabled(i)) + { + // always treat smt core as big core + g_thread_affinity_mask_big.enable(i); + continue; + } + + if (cpu_max_freq_mhz[i] < max_freq_mhz_medium) + g_thread_affinity_mask_little.enable(i); + else + g_thread_affinity_mask_big.enable(i); + } +#elif defined __ANDROID__ || defined __linux__ int max_freq_khz_min = INT_MAX; int max_freq_khz_max = 0; std::vector cpu_max_freq_khz(g_cpucount); @@ -1357,6 +1692,13 @@ static int setup_thread_affinity_masks() for (int i = 0; i < g_cpucount; i++) { + if (is_smt_cpu(i)) + { + // always treat smt core as big core + g_thread_affinity_mask_big.enable(i); + continue; + } + if (cpu_max_freq_khz[i] < max_freq_khz_medium) g_thread_affinity_mask_little.enable(i); else @@ -1477,7 +1819,7 @@ const CpuSet& get_cpu_thread_affinity_mask(int powersave) int set_cpu_thread_affinity(const CpuSet& thread_affinity_mask) { -#if defined __ANDROID__ || defined __linux__ +#if defined __ANDROID__ || defined __linux__ || (defined _WIN32 && !(defined __MINGW32__)) int num_threads = thread_affinity_mask.num_enabled(); #ifdef _OPENMP diff --git a/src/cpu.h b/src/cpu.h index 5a94106ef47..c38061df0b1 100644 --- a/src/cpu.h +++ b/src/cpu.h @@ -17,6 +17,10 @@ #include +#if (defined _WIN32 && !(defined __MINGW32__)) +#define WIN32_LEAN_AND_MEAN +#include +#endif #if defined __ANDROID__ || defined __linux__ #include // cpu_set_t #endif @@ -36,6 +40,9 @@ class NCNN_EXPORT CpuSet int num_enabled() const; public: +#if (defined _WIN32 && !(defined __MINGW32__)) + ULONG_PTR mask; +#endif #if defined __ANDROID__ || defined __linux__ cpu_set_t cpu_set; #endif @@ -110,6 +117,10 @@ NCNN_EXPORT int get_cpu_count(); NCNN_EXPORT int get_little_cpu_count(); NCNN_EXPORT int get_big_cpu_count(); +NCNN_EXPORT int get_physical_cpu_count(); +NCNN_EXPORT int get_physical_little_cpu_count(); +NCNN_EXPORT int get_physical_big_cpu_count(); + // bind all threads on little clusters if powersave enabled // affects HMP arch cpu like ARM big.LITTLE // only implemented on android at the moment diff --git a/src/option.cpp b/src/option.cpp index 4aabfdde5ed..80d4455307e 100644 --- a/src/option.cpp +++ b/src/option.cpp @@ -21,7 +21,7 @@ namespace ncnn { Option::Option() { lightmode = true; - num_threads = get_big_cpu_count(); + num_threads = get_physical_big_cpu_count(); blob_allocator = 0; workspace_allocator = 0;