From b4815227b7d2282c7a397c7cadb7bc9f3483480d Mon Sep 17 00:00:00 2001
From: lltcggie
Date: Sat, 6 Jun 2015 21:40:55 +0900
Subject: [PATCH] Convert the split blocks in parallel
MIME-Version: 1.0
Content-Type: text/plain; charset=UTF-8
Content-Transfer-Encoding: 8bit

---
 common/tinypl.hpp                           | 235 ++++++++++++++++++
 common/waifu2x.cpp                          | 167 +++++++++----
 common/waifu2x.h                            |  28 ++-
 waifu2x-caffe-gui/waifu2x-caffe-gui.vcxproj |   5 +-
 .../waifu2x-caffe-gui.vcxproj.filters       |   3 +
 waifu2x-caffe/waifu2x-caffe.vcxproj         |   5 +-
 waifu2x-caffe/waifu2x-caffe.vcxproj.filters |   3 +
 7 files changed, 388 insertions(+), 58 deletions(-)
 create mode 100644 common/tinypl.hpp

diff --git a/common/tinypl.hpp b/common/tinypl.hpp
new file mode 100644
index 0000000..8102b8f
--- /dev/null
+++ b/common/tinypl.hpp
@@ -0,0 +1,235 @@
+/**
+ * Tiny Parallel Library
+ *
+ * The library implements the following parallel algorithms, whose
+ * interfaces are compatible with Intel TBB and Microsoft PPL:
+ *  - parallel_for_each(first,last,func)
+ *  - parallel_for(first,last,func)
+ *  - parallel_invoke(f1,f2,...) (up to 4 args)
+ */
+#ifndef TINYPL_HPP
+#define TINYPL_HPP
+
+#include <algorithm>
+#include <atomic>
+#include <functional>
+#include <iterator>
+#include <memory>
+#include <thread>
+#include <vector>
+#include <boost/bind.hpp>
+//#define BOOST_ASIO_ENABLE_HANDLER_TRACKING
+#include <boost/asio.hpp>
+
+#ifndef TINYPL_WORKERNUM
+#define TINYPL_WORKERNUM 0
+#endif
+
+#ifndef TINYPL_MIN_ITERATE
+#define TINYPL_MIN_ITERATE 1
+#endif
+
+
+namespace tinypl
+{
+	namespace impl {
+
+		// task scheduler
+		class scheduler {
+			friend struct waiter;
+		public:
+			explicit scheduler(std::size_t thnum = 0)
+				: worker_(new boost::asio::io_service::work(iosrv_))
+			{
+				if (thnum == 0)
+					thnum = std::max(1u, std::thread::hardware_concurrency());
+				thnum_ = thnum;
+				// start worker threads (the calling thread acts as the last worker)
+				for (std::size_t i = 0; i < thnum - 1; ++i)
+					thpool_.emplace_back(std::bind(static_cast<std::size_t (boost::asio::io_service::*)()>(&boost::asio::io_service::run), &iosrv_));
+			}
+
+			~scheduler()
+			{
+				// stop all worker threads
+				worker_.reset();
+				for (auto &th : thpool_)
+					th.join();
+			}
+
+			// # of worker threads
+			std::size_t worker_num() const { return thnum_; }
+
+			// enqueue task
+			template <class F>
+			void enqueue(F f) { iosrv_.post(f); }
+
+			std::vector<std::thread::id> get_thread_pool_id_list() const
+			{
+				std::vector<std::thread::id> list;
+				list.reserve(thpool_.size());
+				for (const auto &th : thpool_)
+					list.push_back(th.get_id());
+
+				return list;
+			}
+
+		public:
+			// get scheduler object
+			static scheduler& instance()
+			{
+				static scheduler sched(TINYPL_WORKERNUM);
+				return sched;
+			}
+
+		private:
+			boost::asio::io_service iosrv_;
+			std::unique_ptr<boost::asio::io_service::work> worker_;
+			std::vector<std::thread> thpool_;
+			std::size_t thnum_;
+		};
+
+		// task waiter
+		struct waiter {
+			scheduler& sched_;
+			volatile std::atomic<int> count_;
+
+			waiter(scheduler& sched, unsigned int count)
+				: sched_(sched), count_(count) {}
+			~waiter()
+			{
+				while (0 < count_) {
+					sched_.iosrv_.poll_one();
+					// FIXME: may degenerate into a heavy busy-loop in the worst case
+				}
+			}
+
+			struct holder {
+				explicit holder(waiter& w) : w_(w) {}
+				~holder() { w_.count_--; }
+				waiter& w_;
+			};
+		};
+
+
+		// task of parallel_for_each algorithm
+		template <class Itr, class Func>
+		void parallel_foreach_task(waiter* w, Itr first, Itr last, const Func& func)
+		{
+			waiter::holder h(*w);
+			while (first != last)
+				func(*first++);
+		}
+
+		// task of parallel_for algorithm
+		template <class IdxType, class Func>
+		void parallel_for_task(waiter* w, IdxType first, IdxType last, const Func& func)
+		{
+			waiter::holder h(*w);
+			while (first < last)
+				func(first++);
+		}
+
+		// task of parallel_invoke algorithm
+		template <class Func>
+		void parallel_invoke_task(waiter* w, const Func& func)
+		{
+			waiter::holder h(*w);
+			func();
+		}
+
+	} // namespace impl
+
+
+	/**
+	 * parallel_for_each algorithm
+	 */
+	template <class Itr, class Func>
+	void parallel_for_each(Itr first, Itr last, const Func& func)
+	{
+		impl::scheduler& sched = impl::scheduler::instance();
+		std::size_t range = std::distance(first, last);
+		std::size_t block = std::max(range / sched.worker_num(), std::size_t(TINYPL_MIN_ITERATE));
+		impl::waiter w(sched, (range + block - 1) / block);
+		for (Itr next = first; first != last; first = next) {
+			std::advance(next, std::min(range, block));
+			range -= std::min(range, block);
+			if (next != last) {
+				sched.enqueue(boost::bind(&impl::parallel_foreach_task<Itr, Func>, &w, first, next, func));
+			}
+			else {
+				impl::parallel_foreach_task(&w, first, next, func);
+			}
+		}
+	}
+
+	/**
+	 * parallel_for algorithm (explicit scheduler)
+	 */
+	template <class IdxType, class Func>
+	void parallel_for(impl::scheduler& sched, IdxType first, IdxType last, const Func& func)
+	{
+		IdxType range = last - first;
+		IdxType block = static_cast<IdxType>(std::max(range / sched.worker_num(), std::size_t(TINYPL_MIN_ITERATE)));
+		impl::waiter w(sched, (range + block - 1) / block);
+		for (IdxType next = first; first < last; first = next) {
+			next = std::min(last, next + block);
+			if (next < last) {
+				sched.enqueue(std::bind(&impl::parallel_for_task<IdxType, Func>, &w, first, next, func));
+			}
+			else {
+				impl::parallel_for_task(&w, first, next, func);
+			}
+		}
+	}
+
+	/**
+	 * parallel_for algorithm
+	 */
+	template <class IdxType, class Func>
+	void parallel_for(IdxType first, IdxType last, const Func& func)
+	{
+		impl::scheduler& sched = impl::scheduler::instance();
+		parallel_for(sched, first, last, func);
+	}
+
+	/**
+	 * parallel_invoke algorithm (2 args)
+	 */
+	template <class F1, class F2>
+	void parallel_invoke(const F1& f1, const F2& f2)
+	{
+		impl::scheduler& sched = impl::scheduler::instance();
+		impl::waiter w(sched, 1);
+		sched.enqueue(boost::bind(&impl::parallel_invoke_task<F1>, &w, f1));
+		f2();
+	}
+
+	/**
+	 * parallel_invoke algorithm (3 args)
+	 */
+	template <class F1, class F2, class F3>
+	void parallel_invoke(const F1& f1, const F2& f2, const F3& f3)
+	{
+		impl::scheduler& sched = impl::scheduler::instance();
+		impl::waiter w(sched, 2);
+		sched.enqueue(boost::bind(&impl::parallel_invoke_task<F1>, &w, f1));
+		sched.enqueue(boost::bind(&impl::parallel_invoke_task<F2>, &w, f2));
+		f3();
+	}
+
+	/**
+	 * parallel_invoke algorithm (4 args)
+	 */
+	template <class F1, class F2, class F3, class F4>
+	void parallel_invoke(const F1& f1, const F2& f2, const F3& f3, const F4& f4)
+	{
+		impl::scheduler& sched = impl::scheduler::instance();
+		impl::waiter w(sched, 3);
+		sched.enqueue(boost::bind(&impl::parallel_invoke_task<F1>, &w, f1));
+		sched.enqueue(boost::bind(&impl::parallel_invoke_task<F2>, &w, f2));
+		sched.enqueue(boost::bind(&impl::parallel_invoke_task<F3>, &w, f3));
+		f4();
+	}
+
+} // namespace tinypl
+
+#endif
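For context, a minimal usage sketch of the tinypl interface introduced above. It is illustrative only and not part of the patch: it assumes tinypl.hpp and Boost (Boost.System for Asio) are on the include/link path, and the file name and the doubling/summing lambdas are made up for the example.

// tinypl_demo.cpp (hypothetical): g++ -std=c++11 tinypl_demo.cpp -lboost_system -pthread
#include <algorithm>
#include <cstdio>
#include <vector>
#include "tinypl.hpp"

int main()
{
	std::vector<int> in(1000, 1), out(1000, 0);

	// parallel_for hands each index in [0, 1000) to the lambda; contiguous
	// index blocks are distributed over the singleton scheduler's workers,
	// and the call returns only after every block has been processed.
	tinypl::parallel_for(0, 1000, [&](int i) { out[i] = in[i] * 2; });

	// parallel_invoke runs independent callables concurrently; the last
	// argument executes on the calling thread.
	int sum = 0, mx = 0;
	tinypl::parallel_invoke(
		[&] { for (int v : out) sum += v; },
		[&] { for (int v : out) mx = std::max(mx, v); });

	std::printf("sum=%d max=%d\n", sum, mx);  // sum=2000 max=2
	return 0;
}

Note that the waiter destructor polls the io_service from the calling thread, so parallel_for/parallel_invoke block until their tasks finish while the caller keeps executing queued tasks rather than sleeping (hence the FIXME about busy-looping).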
diff --git a/common/waifu2x.cpp b/common/waifu2x.cpp
index 2a243e9..49de440 100644
--- a/common/waifu2x.cpp
+++ b/common/waifu2x.cpp
@@ -9,6 +9,7 @@
 #include
 #include
 #include
+#include "tinypl.hpp"
 
 #if defined(WIN32) || defined(WIN64)
 #include
@@ -62,7 +63,7 @@ static std::once_flag waifu2x_cudnn_once_flag;
 	} \
 } while (0)
 
-Waifu2x::Waifu2x() : is_inited(false), isCuda(false), block(nullptr), dummy_data(nullptr), out_block(nullptr)
+Waifu2x::Waifu2x() : job(1), is_inited(false), isCuda(false)
 {
 }
 
@@ -375,7 +376,7 @@ Waifu2x::eWaifu2xError Waifu2x::ConstractNet(boost::shared_ptr
 }
 
 // Reconstruct the image using the network
-Waifu2x::eWaifu2xError Waifu2x::ReconstructImage(boost::shared_ptr<caffe::Net<float>> net, cv::Mat &im)
+Waifu2x::eWaifu2xError Waifu2x::ReconstructImage(std::vector<boost::shared_ptr<caffe::Net<float>>> nets, cv::Mat &im)
 {
 	const auto Height = im.size().height;
 	const auto Width = im.size().width;
@@ -390,18 +391,6 @@ Waifu2x::eWaifu2xError Waifu2x::ReconstructImage(boost::shared_ptr
-	const auto input_layer =
-		boost::dynamic_pointer_cast<caffe::MemoryDataLayer<float>>(
-		net->layer_by_name("image_input_layer"));
-	assert(input_layer);
-
-	const auto conv7_layer =
-		boost::dynamic_pointer_cast<caffe::ConvolutionLayer<float>>(
-		net->layer_by_name("conv7_layer"));
-	assert(conv7_layer);
-
-	input_layer->set_batch_size(batch_size);
-
 	const int WidthNum = Width / output_size;
 	const int HeightNum = Height / output_size;
@@ -410,9 +399,41 @@ Waifu2x::eWaifu2xError Waifu2x::ReconstructImage(boost::shared_ptr
 	const int BlockNum = WidthNum * HeightNum;
+	const int BatchNum = (BlockNum + batch_size - 1) / batch_size;
+
+	for (auto &net : nets)
+	{
+		const auto input_layer =
+			boost::dynamic_pointer_cast<caffe::MemoryDataLayer<float>>(
+			net->layer_by_name("image_input_layer"));
+		assert(input_layer);
+
+		input_layer->set_batch_size(batch_size);
+	}
+
+	// Reconstruct the image in output_size*output_size blocks (to limit memory consumption)
+	tinypl::parallel_for(*net_scheduler, 0, BatchNum, [&](const int batch_n)
+	{
+		const auto id = std::this_thread::get_id();
+		const auto net_scheduler_id_map_it = net_scheduler_id_map.find(id);
+
+		assert(net_scheduler_id_map_it != net_scheduler_id_map.end());
+
+		const int index = net_scheduler_id_map_it->second;
+
+		auto net = nets[index];
+
+		float *block = blocks[index];
+		float *dummy_data = dummy_datas[index];
+		float *out_block = out_blocks[index];
+
+		const auto input_layer =
+			boost::dynamic_pointer_cast<caffe::MemoryDataLayer<float>>(
+			net->layer_by_name("image_input_layer"));
+		assert(input_layer);
+
+		const int num = batch_n * batch_size;
+		const int processNum = (BlockNum - num) >= batch_size ? batch_size : BlockNum - num;
 
 		if (processNum < batch_size)
@@ -455,7 +476,7 @@ Waifu2x::eWaifu2xError Waifu2x::ReconstructImage(boost::shared_ptr
-		input_layer->Reset(block, dummy_data, input_block_plane_size * batch_size);
+		input_layer->Reset(block, dummy_data, input_block_plane_size * processNum);
 
 		// compute
 		auto out = net->ForwardPrefilled(nullptr);
@@ -487,7 +508,7 @@ Waifu2x::eWaifu2xError Waifu2x::ReconstructImage(boost::shared_ptr
+	const auto list = net_scheduler->get_thread_pool_id_list();
+	for (size_t i = 0; i < list.size(); i++)
+		net_scheduler_id_map.emplace(list[i], i);
+
+	net_scheduler_id_map.emplace(std::this_thread::get_id(), list.size());
 
 	is_inited = true;
 }
 
@@ -631,20 +688,34 @@ Waifu2x::eWaifu2xError Waifu2x::init(int argc, char** argv, const std::string &M
 
 void Waifu2x::destroy()
 {
-	net_noise.reset();
-	net_scale.reset();
+	net_scheduler.reset();
+
+	net_noises.clear();
+	net_scales.clear();
 
 	if (isCuda)
 	{
-		CUDA_HOST_SAFE_FREE(block);
-		CUDA_HOST_SAFE_FREE(dummy_data);
-		CUDA_HOST_SAFE_FREE(out_block);
+		for (auto &block : blocks) {
+			CUDA_HOST_SAFE_FREE(block);
+		}
+		for (auto &dummy_data : dummy_datas) {
+			CUDA_HOST_SAFE_FREE(dummy_data);
+		}
+		for (auto &out_block : out_blocks) {
+			CUDA_HOST_SAFE_FREE(out_block);
+		}
 	}
 	else
 	{
-		SAFE_DELETE_WAIFU2X(block);
-		SAFE_DELETE_WAIFU2X(dummy_data);
-		SAFE_DELETE_WAIFU2X(out_block);
+		for (auto &block : blocks) {
+			SAFE_DELETE_WAIFU2X(block);
+		}
+		for (auto &dummy_data : dummy_datas) {
+			SAFE_DELETE_WAIFU2X(dummy_data);
+		}
+		for (auto &out_block : out_blocks) {
+			SAFE_DELETE_WAIFU2X(out_block);
+		}
 	}
 
 	is_inited = false;
@@ -680,7 +751,7 @@ Waifu2x::eWaifu2xError Waifu2x::waifu2x(const std::string &input_file, const std
 	{
 		PaddingImage(im, im);
 
-		ret = ReconstructImage(net_noise, im);
+		ret = ReconstructImage(net_noises, im);
 		if (ret != eWaifu2xError_OK)
 			return ret;
 
@@ -701,7 +772,7 @@ Waifu2x::eWaifu2xError Waifu2x::waifu2x(const std::string &input_file, const std
 	{
 		Zoom2xAndPaddingImage(im, im, image_size);
 
-		ret = ReconstructImage(net_scale, im);
+		ret = ReconstructImage(net_scales, im);
 		if (ret != eWaifu2xError_OK)
 			return ret;
diff --git a/common/waifu2x.h b/common/waifu2x.h
index b35293c..4ec4c31 100644
--- a/common/waifu2x.h
+++ b/common/waifu2x.h
@@ -5,8 +5,11 @@
 #include
 #include
 #include
+#include <thread>
+#include <unordered_map>
 #include
 #include
+#include <memory>
 
 
 namespace caffe
@@ -15,6 +18,15 @@ namespace caffe
 {
 	template <typename Dtype>
 	class Net;
 };
 
+namespace tinypl
+{
+	namespace impl
+	{
+		// task scheduler
+		class scheduler;
+	}
+}
+
 class Waifu2x
 {
 public:
@@ -64,15 +76,19 @@ private:
 	double scale_ratio;
 	std::string model_dir;
 	std::string process;
+	int job;
 
 	bool isCuda;
 
-	boost::shared_ptr<caffe::Net<float>> net_noise;
-	boost::shared_ptr<caffe::Net<float>> net_scale;
+	std::vector<boost::shared_ptr<caffe::Net<float>>> net_noises;
+	std::vector<boost::shared_ptr<caffe::Net<float>>> net_scales;
 
-	float *block;
-	float *dummy_data;
-	float *out_block;
+	std::vector<float *> blocks;
+	std::vector<float *> dummy_datas;
+	std::vector<float *> out_blocks;
+
+	std::unique_ptr<tinypl::impl::scheduler> net_scheduler;
+	std::unordered_map<std::thread::id, int> net_scheduler_id_map;
 
 private:
 	eWaifu2xError LoadImage(cv::Mat &float_image, const std::string &input_file);
@@ -82,7 +98,7 @@ private:
 	eWaifu2xError CreateZoomColorImage(const cv::Mat &float_image, const cv::Size_<int> &zoom_size, std::vector<cv::Mat> &cubic_planes);
 	eWaifu2xError LoadParameter(boost::shared_ptr<caffe::Net<float>> net, const std::string &param_path);
 	eWaifu2xError ConstractNet(boost::shared_ptr<caffe::Net<float>> &net, const std::string &model_path, const std::string &process);
-	eWaifu2xError ReconstructImage(boost::shared_ptr<caffe::Net<float>> net, cv::Mat &im);
+	eWaifu2xError ReconstructImage(std::vector<boost::shared_ptr<caffe::Net<float>>> nets, cv::Mat &im);
 
 public:
 	Waifu2x();
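The header changes above replace the single net and scratch buffers with one set per scheduler thread, selected inside the parallel loop through net_scheduler_id_map. The self-contained sketch below mirrors that pattern; it is not waifu2x code, and Resource, slots and slot_map are illustrative stand-ins for the per-thread nets and buffers (the job count of 2 is also arbitrary).

// Illustrative only: one resource slot per scheduler thread, looked up by
// std::this_thread::get_id() inside tinypl::parallel_for.
#include <cassert>
#include <cstdio>
#include <thread>
#include <unordered_map>
#include <vector>
#include "tinypl.hpp"

struct Resource { int value = 0; };  // stand-in for a per-thread net plus its buffers

int main()
{
	tinypl::impl::scheduler sched(2);  // like net_scheduler, built with a small job count

	// One slot per worker thread, plus one for the calling thread.
	const auto ids = sched.get_thread_pool_id_list();
	std::vector<Resource> slots(ids.size() + 1);

	std::unordered_map<std::thread::id, int> slot_map;
	for (size_t i = 0; i < ids.size(); i++)
		slot_map.emplace(ids[i], static_cast<int>(i));
	slot_map.emplace(std::this_thread::get_id(), static_cast<int>(ids.size()));

	// Each task uses the slot owned by whichever thread runs it, so no two
	// tasks ever touch the same Resource at the same time.
	tinypl::parallel_for(sched, 0, 100, [&](int) {
		const auto it = slot_map.find(std::this_thread::get_id());
		assert(it != slot_map.end());
		slots[it->second].value++;
	});

	int total = 0;
	for (const auto &r : slots) total += r.value;
	std::printf("tasks run: %d\n", total);  // 100
	return 0;
}

This is why ReconstructImage can run batches concurrently without any locking: each thread only ever touches the net and buffers at its own index.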
diff --git a/waifu2x-caffe-gui/waifu2x-caffe-gui.vcxproj b/waifu2x-caffe-gui/waifu2x-caffe-gui.vcxproj
index 01abfa8..6bf6272 100644
--- a/waifu2x-caffe-gui/waifu2x-caffe-gui.vcxproj
+++ b/waifu2x-caffe-gui/waifu2x-caffe-gui.vcxproj
@@ -55,7 +55,7 @@
       <WarningLevel>Level3</WarningLevel>
       <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>WIN32;_DEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>WIN32;_WIN32_WINNT=0x0600;_DEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <SubSystem>Windows</SubSystem>
@@ -74,7 +74,7 @@
       <Optimization>MaxSpeed</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>WIN32;NDEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>WIN32;_WIN32_WINNT=0x0600;NDEBUG;_WINDOWS;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <SubSystem>Windows</SubSystem>
@@ -95,6 +95,7 @@
+    <ClInclude Include="..\common\tinypl.hpp" />
diff --git a/waifu2x-caffe-gui/waifu2x-caffe-gui.vcxproj.filters b/waifu2x-caffe-gui/waifu2x-caffe-gui.vcxproj.filters
index 4282ea2..dbb02c5 100644
--- a/waifu2x-caffe-gui/waifu2x-caffe-gui.vcxproj.filters
+++ b/waifu2x-caffe-gui/waifu2x-caffe-gui.vcxproj.filters
@@ -56,6 +56,9 @@
       <Filter>ヘッダー ファイル</Filter>
     </ClInclude>
+    <ClInclude Include="..\common\tinypl.hpp">
+      <Filter>ヘッダー ファイル</Filter>
+    </ClInclude>
diff --git a/waifu2x-caffe/waifu2x-caffe.vcxproj b/waifu2x-caffe/waifu2x-caffe.vcxproj
index b374766..b7295f8 100644
--- a/waifu2x-caffe/waifu2x-caffe.vcxproj
+++ b/waifu2x-caffe/waifu2x-caffe.vcxproj
@@ -55,7 +55,7 @@
       <WarningLevel>Level3</WarningLevel>
       <Optimization>Disabled</Optimization>
-      <PreprocessorDefinitions>WIN32;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>WIN32;_WIN32_WINNT=0x0600;_DEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <SubSystem>Console</SubSystem>
@@ -71,7 +71,7 @@
       <Optimization>MaxSpeed</Optimization>
       <FunctionLevelLinking>true</FunctionLevelLinking>
       <IntrinsicFunctions>true</IntrinsicFunctions>
-      <PreprocessorDefinitions>WIN32;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
+      <PreprocessorDefinitions>WIN32;_WIN32_WINNT=0x0600;NDEBUG;_CONSOLE;%(PreprocessorDefinitions)</PreprocessorDefinitions>
       <SubSystem>Console</SubSystem>
@@ -86,6 +86,7 @@
+    <ClInclude Include="..\common\tinypl.hpp" />
diff --git a/waifu2x-caffe/waifu2x-caffe.vcxproj.filters b/waifu2x-caffe/waifu2x-caffe.vcxproj.filters
index 86e4980..4305a80 100644
--- a/waifu2x-caffe/waifu2x-caffe.vcxproj.filters
+++ b/waifu2x-caffe/waifu2x-caffe.vcxproj.filters
@@ -26,5 +26,8 @@
       <Filter>ヘッダー ファイル</Filter>
     </ClInclude>
+    <ClInclude Include="..\common\tinypl.hpp">
+      <Filter>ヘッダー ファイル</Filter>
+    </ClInclude>
\ No newline at end of file
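One note on the project-file change: _WIN32_WINNT=0x0600 tells Boost.Asio (pulled in through tinypl.hpp) to target Windows Vista or later; without it the Windows build emits a "please define _WIN32_WINNT" warning and assumes an older default target. A minimal sketch of the equivalent when building outside the .vcxproj files (not part of the patch):

// Equivalent of the /D _WIN32_WINNT=0x0600 project setting: define the target
// Windows version before any Boost.Asio header is included.
#if defined(_WIN32) && !defined(_WIN32_WINNT)
#define _WIN32_WINNT 0x0600  // 0x0600 = Windows Vista
#endif

#include <boost/asio.hpp>

int main()
{
	boost::asio::io_service iosrv;  // compiles without the Asio target-version warning
	iosrv.post([] {});              // queue a no-op handler
	iosrv.run();                    // run it and return
	return 0;
}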