diff --git a/bin/models/upresnet10/noise0_scale2.0x_model.prototxt b/bin/models/upresnet10/noise0_scale2.0x_model.prototxt
index c083cc7..9def1bf 100644
--- a/bin/models/upresnet10/noise0_scale2.0x_model.prototxt
+++ b/bin/models/upresnet10/noise0_scale2.0x_model.prototxt
@@ -196,7 +196,7 @@ layer {
 }
 layer {
   name: "/res1/axpy"
-  type: "Axpy"
+  type: "AxpyFast"
   bottom: "/res1/fc2_sigmoid"
   bottom: "/res1/conv2_relu"
   bottom: "/res1/crop"
@@ -353,7 +353,7 @@ layer {
 }
 layer {
   name: "/res2/axpy"
-  type: "Axpy"
+  type: "AxpyFast"
   bottom: "/res2/fc2_sigmoid"
   bottom: "/res2/conv2_relu"
   bottom: "/res2/crop"
@@ -510,7 +510,7 @@ layer {
 }
 layer {
   name: "/res3/axpy"
-  type: "Axpy"
+  type: "AxpyFast"
   bottom: "/res3/fc2_sigmoid"
   bottom: "/res3/conv2_relu"
   bottom: "/res3/crop"
@@ -667,7 +667,7 @@ layer {
 }
 layer {
   name: "/res4/axpy"
-  type: "Axpy"
+  type: "AxpyFast"
   bottom: "/res4/fc2_sigmoid"
   bottom: "/res4/conv2_relu"
   bottom: "/res4/crop"
@@ -824,7 +824,7 @@ layer {
 }
 layer {
   name: "/res5/axpy"
-  type: "Axpy"
+  type: "AxpyFast"
   bottom: "/res5/fc2_sigmoid"
   bottom: "/res5/conv2_relu"
   bottom: "/res5/crop"
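For context: the Caffe Axpy layer used by these residual blocks computes F = a * X + Y, where a (first bottom, fc2_sigmoid) is an N x C (x 1 x 1) per-channel scale, X (conv2_relu) is the N x C x H x W feature map, and Y (crop) is a residual of the same shape as X. Renaming the type to AxpyFast only changes which registered layer factory OpenCV dispatches to; the expected math is unchanged. A minimal CPU reference of that contract -- an illustrative sketch, not part of the patch -- assuming contiguous CV_32F NCHW blobs:

    #include <opencv2/core.hpp>

    // Reference Axpy: out = a (per-channel) * x + y.
    // a: N x C (x 1 x 1), x and y: N x C x H x W, all CV_32F and contiguous.
    static cv::Mat axpyReference(const cv::Mat& a, const cv::Mat& x, const cv::Mat& y)
    {
        CV_Assert(x.dims == 4 && x.size == y.size &&
                  a.size[0] == x.size[0] && a.size[1] == x.size[1]);
        cv::Mat out(x.dims, x.size.p, CV_32F);
        const size_t plane = (size_t)x.size[2] * x.size[3]; // H * W
        for (int n = 0; n < x.size[0]; ++n)
            for (int c = 0; c < x.size[1]; ++c)
            {
                const float scale = a.ptr<float>(n, c)[0];
                const float* px = x.ptr<float>(n, c);
                const float* py = y.ptr<float>(n, c);
                float* po = out.ptr<float>(n, c);
                for (size_t i = 0; i < plane; ++i)
                    po[i] = scale * px[i] + py[i];
            }
        return out;
    }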
diff --git a/waifu2x-caffe/Test.cpp b/waifu2x-caffe/Test.cpp
index ac559cc..1d07495 100644
--- a/waifu2x-caffe/Test.cpp
+++ b/waifu2x-caffe/Test.cpp
@@ -1,5 +1,6 @@
 # include
 # include
+# include <iomanip>
 # include
 # include
 # include
@@ -280,18 +281,98 @@ public:
     }
 };
 
+static double sumAllElements(const cv::Mat& mat)
+{
+    CV_Assert(!mat.empty());
+
+    const cv::Scalar s = cv::sum(mat); // per-channel sums
+    double total = 0.0;
+    for (int c = 0; c < mat.channels(); ++c) {
+        total += s[c];
+    }
+    return total;
+}
+
+
+// ---- internal implementation ----
+template <typename T>
+static void printRec(const cv::Mat& m, std::vector<int>& idx, int d) {
+    if (d == m.dims - 1) {
+        // last axis: print the one-dimensional run of elements
+        const int cn = m.channels();
+        std::cout << "[";
+        for (int i = 0; i < m.size[d]; ++i) {
+            idx[d] = i;
+            const T* p = m.ptr<T>(idx.data()); // pointer to the element at idx (first channel, ch=0)
+            if (cn == 1) {
+                std::cout << +p[0]; // unary + promotes uchar/schar so they print as numbers
+            }
+            else {
+                std::cout << "(";
+                for (int c = 0; c < cn; ++c) {
+                    std::cout << +p[c];
+                    if (c + 1 < cn) std::cout << ", ";
+                }
+                std::cout << ")";
+            }
+            if (i + 1 < m.size[d]) std::cout << ", ";
+        }
+        std::cout << "]";
+    }
+    else {
+        // intermediate axis: recurse one dimension deeper
+        std::cout << "[";
+        for (int i = 0; i < m.size[d]; ++i) {
+            idx[d] = i;
+            printRec<T>(m, idx, d + 1);
+            if (i + 1 < m.size[d]) std::cout << ",\n";
+        }
+        std::cout << "]";
+    }
+}
+
+template <typename T>
+static void printMatND_T(const cv::Mat& m) {
+    // keep the number of decimal places modest for floating-point types
+    if (std::is_floating_point<T>::value) {
+        std::cout << std::fixed << std::setprecision(6);
+    }
+    std::vector<int> idx(m.dims, 0);
+    printRec<T>(m, idx, 0);
+    std::cout << std::endl;
+}
+
+// entry point (dispatches on the cv::Mat depth)
+static void printMatND(const cv::Mat& m) {
+    switch (m.depth()) {
+    case CV_8U:  printMatND_T<uchar>(m);  break;
+    case CV_8S:  printMatND_T<schar>(m);  break;
+    case CV_16U: printMatND_T<ushort>(m); break;
+    case CV_16S: printMatND_T<short>(m);  break;
+    case CV_32S: printMatND_T<int>(m);    break;
+    case CV_32F: printMatND_T<float>(m);  break;
+    case CV_64F: printMatND_T<double>(m); break;
+    default:
+        throw std::runtime_error("Unsupported Mat depth.");
+    }
+}
+
+
 void reg();
+void reg2();
 
 int main(int argc, char** argv) {
     //CV_DNN_REGISTER_LAYER_CLASS(CropCenter, CropCenterLayer);
     reg();
+    reg2();
 
     // ImageNet Caffe reference model
     string protoFile = "models/upresnet10/noise0_scale2.0x_model.prototxt";
     string modelFile = "models/upresnet10/noise0_scale2.0x_model.json.caffemodel";
 
     // image file
-    string imageFile = (argc > 1) ? argv[1] : "images/cat.jpg";
+    //string imageFile = (argc > 1) ? argv[1] : "images/cat.jpg";
+    string imageFile = "red.png";
 
     // load the Caffe model
     cv::dnn::Net net;
@@ -321,25 +402,37 @@
     cv::resize(img, img, cv::Size(cropSize, cropSize));
     // convert to the blob format Caffe uses (in fact a wrapper class around cv::Mat)
     const auto inputBlob = cv::dnn::blobFromImage(img, 1.0 / 255.0, cv::Size(), cv::Scalar(), true, false, CV_32F);
+
+    //printMatND(inputBlob);
+    std::vector<int> indim(inputBlob.size.p, inputBlob.size.p + inputBlob.size.dims());
 
     // feed the image to the input layer
-    net.setInput(inputBlob);
+    net.setInput(inputBlob, "input");
     // run the forward pass and fetch the output layer's (Softmax) result; the prediction is stored here
     // (a 1x1000 matrix/vector of 32-bit float probabilities, one per ImageNet class)
-    const auto probMat = net.forward();
+    //const auto probMat = net.forward("/conv_post");
+    const auto probMat = net.forward("/res1/axpy");
+
+    std::vector<int> probMatDim(probMat.size.p, probMat.size.p + probMat.size.dims());
+    auto sss = sumAllElements(probMat);
+    //printMatND(probMat);
 
     std::vector<cv::Mat> outImgs;
     cv::dnn::imagesFromBlob(probMat, outImgs);
     //cv::dnn::imagesFromBlob(inputBlob, outImgs);
     auto outImg = outImgs[0];
 
+    std::vector<int> outdim(outImg.size.p, outImg.size.p + outImg.size.dims());
+    printMatND(outImg);
+
+    //std::cout << cv::format(outImg, cv::Formatter::FMT_DEFAULT) << std::endl;
+
     // clip values into the range 0-1
     cv::threshold(outImg, outImg, 1.0, 1.0, cv::THRESH_TRUNC);
     cv::threshold(outImg, outImg, 0.0, 0.0, cv::THRESH_TOZERO);
     const double clip_eps8 = (1.0 / 255.0) * 0.5 - (1.0e-7 * (1.0 / 255.0) * 0.5);
     outImg.convertTo(outImg, CV_8U, 255.0, clip_eps8);
-    std::vector<int> outdim(outImg.size.p, outImg.size.p + outImg.size.dims());
 
     cv::cvtColor(outImg, outImg, cv::COLOR_RGB2BGR);
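These Test.cpp edits turn the demo into a layer-debugging harness: the input is a local test image (red.png), the forward pass is cut short at /res1/axpy, and sumAllElements / printMatND give quick checksums of the blob. Because cv::dnn::Net::forward also accepts a list of output blob names, the same harness can fetch the layer's three bottoms together with its output and diff them against a naive reference in one pass -- a sketch to drop into main() after the net is loaded, assuming the top blob names match the prototxt layer names and reusing axpyReference from the sketch above:

    // Fetch the three bottoms of /res1/axpy plus its output in one pass,
    // then compare against the naive reference (axpyReference, above).
    std::vector<cv::Mat> outs;
    const std::vector<cv::String> names = {
        "/res1/fc2_sigmoid", "/res1/conv2_relu", "/res1/crop", "/res1/axpy" };
    net.forward(outs, names);
    const double maxAbsDiff =
        cv::norm(outs[3], axpyReference(outs[0], outs[1], outs[2]), cv::NORM_INF);
    std::cout << "max abs diff vs reference: " << maxAbsDiff << std::endl;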
diff --git a/waifu2x-caffe/axpy.hpp b/waifu2x-caffe/axpy.hpp
new file mode 100644
index 0000000..ec86d83
--- /dev/null
+++ b/waifu2x-caffe/axpy.hpp
@@ -0,0 +1,106 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SCALE_SHIFT_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SCALE_SHIFT_HPP
+
+//#include "../../op_cuda.hpp"
+//
+//#include "../csl/stream.hpp"
+//#include "../csl/tensor.hpp"
+//
+//#include "../kernels/scale_shift.hpp"
+
+#include
+
+#include
+
+#include
+
+#include
+#include
+
+namespace cv {
+    namespace dnn {
+        namespace cuda4dnn {
+
+            template <class T>
+            class AxpyOp final : public CUDABackendNode {
+            public:
+                using wrapper_type = GetCUDABackendWrapperType<T>;
+
+                AxpyOp(csl::Stream stream_)
+                    : stream(std::move(stream_)), axis(0)
+                {
+                }
+
+                void forward(
+                    const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+                    const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+                    csl::Workspace& workspace) override
+                {
+                    CV_Assert(inputs.size() == 3);
+                    CV_Assert(outputs.size() == 1);
+
+                    auto input_wrapper = inputs[1].dynamicCast<wrapper_type>();
+                    auto input = input_wrapper->getView();
+
+                    auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+                    auto output = output_wrapper->getSpan();
+
+                    /* number of batches in the weights/bias
+                     * trainable mode: same for all batches
+                     * untrainable mode: could be different for different batch samples
+                     */
+                    std::size_t parameter_batch_size = 1;
+
+                    csl::TensorView<T> weights = inputs[0].dynamicCast<wrapper_type>()->getView();
+                    parameter_batch_size = weights.get_axis_size(0);
+                    CV_Assert(parameter_batch_size == input.get_axis_size(0));
+
+                    csl::TensorView<T> bias = inputs[2].dynamicCast<wrapper_type>()->getView();
+                    parameter_batch_size = bias.get_axis_size(0);
+                    CV_Assert(parameter_batch_size == input.get_axis_size(0));
+
+                    CV_Assert(!weights.empty() || !bias.empty());
+                    if (!weights.empty() && !bias.empty())
+                    {
+                        CV_CheckEQ(weights.size(), bias.size(), "different broadcasting options for weights and bias is not supported");
+                    }
+
+                    const auto num_parameters = !weights.empty() ? weights.size() : bias.size();
+                    const auto mid_size = num_parameters / parameter_batch_size;
+
+                    /* the scale shift operation might require broadcasting */
+                    const int end_axis = [&] {
+                        if (num_parameters == 1) {
+                            return static_cast<int>(axis + 1);
+                        }
+                        for (int endAxis = axis + 1; endAxis <= input.rank(); endAxis++) {
+                            if (input.size_range(axis, endAxis) == mid_size)
+                                return endAxis;
+                        }
+                        CV_Assert(0 /* failed to find a broadcast config */);
+                    }();
+
+                    std::size_t inner_size = input.size_range(end_axis, input.rank());
+
+                    if (!weights.empty() && !bias.empty())
+                        kernels::scaleN_with_biasN<T>(stream, output, input, inner_size, weights, bias);
+                    else if (!weights.empty())
+                        kernels::scaleN<T>(stream, output, input, inner_size, weights);
+                    else
+                        kernels::biasN<T>(stream, output, input, inner_size, bias);
+                }
+
+            private:
+                csl::Stream stream;
+                std::size_t axis;
+            };
+
+        }
+    }
+} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SCALE_SHIFT_HPP */
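Note that axpy.hpp is OpenCV's cuda4dnn ScaleShift primitive with the class renamed to AxpyOp: it still checks weights.size() == bias.size(), which holds for a per-channel scale/bias pair but not for Axpy, whose third bottom is a full N x C x H x W blob (the CPU implementation below carries a TODO about the same mismatch). A hedged sketch of what an Axpy-correct forward body might look like, assuming the cuda4dnn scaleN and eltwise_sum_2 kernels are available under these names:

    // Hypothetical replacement for AxpyOp::forward (not the committed code):
    // out = a * x per channel, then out += y elementwise.
    auto a   = inputs[0].dynamicCast<wrapper_type>()->getView(); // N x C x 1 x 1
    auto x   = inputs[1].dynamicCast<wrapper_type>()->getView(); // N x C x H x W
    auto y   = inputs[2].dynamicCast<wrapper_type>()->getView(); // same shape as x
    auto out = outputs[0].dynamicCast<wrapper_type>()->getSpan();

    const std::size_t inner_size = x.size_range(2, x.rank());   // H * W
    kernels::scaleN<T>(stream, out, x, inner_size, a);          // out = a * x
    kernels::eltwise_sum_2<T>(stream, out, out, y);             // out += y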
diff --git a/waifu2x-caffe/axpy_fast_layer.cpp b/waifu2x-caffe/axpy_fast_layer.cpp
new file mode 100644
index 0000000..f2546e3
--- /dev/null
+++ b/waifu2x-caffe/axpy_fast_layer.cpp
@@ -0,0 +1,371 @@
+#include
+//#include
+#include
+
+//#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+#ifdef HAVE_CUDA
+//#include
+#include "axpy.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
+namespace cv
+{
+    namespace dnn
+    {
+
+        class AxpyFastLayerImpl CV_FINAL : public Layer
+        {
+        public:
+#ifdef HAVE_WEBNN
+            mutable int dims;
+            mutable int numChannels;
+#endif
+            AxpyFastLayerImpl(const LayerParams& params)
+            {
+                setParamsFrom(params);
+            }
+
+            bool getMemoryShapes(const std::vector<MatShape>& inputs,
+                                 const int requiredOutputs,
+                                 std::vector<MatShape>& outputs,
+                                 std::vector<MatShape>& internals) const CV_OVERRIDE
+            {
+                outputs.assign(1, inputs[1]);
+#ifdef HAVE_WEBNN
+                dims = inputs[0].size();
+                numChannels = 1;
+                if (inputs.size() > 1)
+                {
+                    for (const size_t& dim : inputs[1])
+                        numChannels *= dim;
+                }
+#endif
+                return true;
+            }
+
+            virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
+            {
+                std::vector<Mat> inputs;
+                inputs_arr.getMatVector(inputs);
+                CV_Assert(inputs.size() == 3);
+            }
+
+            virtual bool supportBackend(int backendId) CV_OVERRIDE
+            {
+#ifdef HAVE_INF_ENGINE
+                if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+                    return true;
+#endif
+                return backendId == DNN_BACKEND_OPENCV ||
+                       backendId == DNN_BACKEND_CUDA ||
+                       backendId == DNN_BACKEND_HALIDE ||
+                       backendId == DNN_BACKEND_WEBNN;
+            }
+
+            void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
+            {
+                CV_TRACE_FUNCTION();
+                CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+                if (inputs_arr.depth() == CV_16F)
+                {
+                    forward_fallback(inputs_arr, outputs_arr, internals_arr);
+                    return;
+                }
+
+                std::vector<Mat> inputs, outputs;
+                inputs_arr.getMatVector(inputs);
+                outputs_arr.getMatVector(outputs);
+
+                CV_Assert_N(outputs.size() == 1, inputs.size() == 3);
+
+                Mat& inpBlob = inputs[1];
+                Mat& outBlob = outputs[0];
+                // There is a mode when we multiply a first blob by a second one
+                // instead of trainable weights.
+                Mat weights = inputs[0].reshape(1, 1);
+                Mat bias = inputs[2].reshape(1, 1);
+
+                MatShape inpShape0 = shape(inputs[0]);
+                MatShape inpShape1 = shape(inputs[1]);
+                MatShape inpShape2 = shape(inputs[2]);
+
+                // TODO: rewrite the computation; the bias the Caffe layer expects has a
+                // different shape from the one assumed here.
+                // Assumed here:               weights.shape == bias.shape
+                // Assumed by the Caffe layer: inpBlob.shape == bias.shape
+
+                MatShape inpShape = shape(inpBlob);
+                const int numWeights = weights.total();
+                CV_Assert(numWeights != 0);
+                CV_CheckEQ(weights.total(), bias.total(), "Incompatible weights/bias blobs");
+
+                if (weights.total() == 1)
+                {
+                    // The total() of bias should be same as weights.
+                    inpBlob.convertTo(outBlob, CV_32F, weights.at<float>(0), bias.at<float>(0));
+                    return;
+                }
+
+                int endAxis;
+                for (endAxis = 1; endAxis <= inpBlob.dims; ++endAxis)
+                {
+                    if (total(inpShape, 0, endAxis) == numWeights)
+                        break;
+                }
+                CV_Assert(total(inpShape, 0, endAxis) == numWeights);
+                CV_Assert(numWeights == bias.total());
+                CV_CheckTypeEQ(inpBlob.type(), CV_32FC1, ""); CV_CheckTypeEQ(outBlob.type(), CV_32FC1, "");
+
+                int numSlices = total(inpShape, 0, 0);
+                float* inpData = (float*)inpBlob.data;
+                float* outData = (float*)outBlob.data;
+
+                if (endAxis != inpBlob.dims)
+                {
+                    float* weightsData = (float*)weights.data;
+                    float* biasesData = (float*)bias.data;
+                    int spatialSize = total(inpShape, endAxis); // spatialSize != 1
+                    for (int i = 0; i < numSlices; ++i)
+                    {
+                        for (int j = 0; j < numWeights; ++j)
+                        {
+                            float w = weightsData ? weightsData[j] : 1;
+                            float b = biasesData ? biasesData[j] : 0;
+                            Mat inpSlice(1, spatialSize, CV_32F, inpData);
+                            Mat outSlice(1, spatialSize, CV_32F, outData);
+
+                            inpSlice.convertTo(outSlice, CV_32F, w, b);
+
+                            inpData += spatialSize;
+                            outData += spatialSize;
+                        }
+                    }
+                }
+                else
+                {
+                    for (int i = 0; i < numSlices; ++i)
+                    {
+                        Mat inpSlice(1, numWeights, CV_32F, inpData);
+                        Mat outSlice(1, numWeights, CV_32F, outData);
+
+                        multiply(inpSlice, weights, outSlice);
+                        add(outSlice, bias, outSlice);
+
+                        inpData += numWeights;
+                        outData += numWeights;
+                    }
+                }
+            }
+
+#ifdef HAVE_CUDA
+            Ptr<BackendNode> initCUDA(
+                void* context_,
+                const std::vector<Ptr<BackendWrapper>>& inputs,
+                const std::vector<Ptr<BackendWrapper>>& outputs
+            ) override
+            {
+                auto context = reinterpret_cast<csl::CSLContext*>(context_);
+
+                CV_Assert(inputs.size() == 3);
+
+                return make_cuda_node<AxpyOp>(preferableTarget, std::move(context->stream));
+            }
+#endif
+
+            virtual Ptr<BackendNode> tryAttach(const Ptr<BackendNode>& node) CV_OVERRIDE
+            {
+                switch (node->backendId)
+                {
+                case DNN_BACKEND_HALIDE:
+                {
+#ifdef HAVE_HALIDE
+                    auto base = node.dynamicCast<HalideBackendNode>();
+                    Halide::Func& input = base->funcs.back();
+                    Halide::Var x("x"), y("y"), c("c"), n("n");
+                    Halide::Func top = attachHalide(input(x, y, c, n));
+                    return Ptr<BackendNode>(new HalideBackendNode(base, top));
+#endif // HAVE_HALIDE
+                    break;
+                }
+                }
+                return Ptr<BackendNode>();
+            }
+
+            virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
+            {
+#ifdef HAVE_HALIDE
+                Halide::Buffer<float> input = halideBuffer(inputs[0]);
+                Halide::Var x("x"), y("y"), c("c"), n("n");
+                Halide::Func top = attachHalide(input(x, y, c, n));
+                return Ptr<BackendNode>(new HalideBackendNode(top));
+#endif // HAVE_HALIDE
+                return Ptr<BackendNode>();
+            }
+
+#ifdef HAVE_HALIDE
+            // attachHalide can work both with Halide::Buffer and Halide::Func. In the
+            // second case it will be a fusion.
+            Halide::Func attachHalide(const Halide::Expr& input)
+            {
+                Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
+                Halide::Var x("x"), y("y"), c("c"), n("n");
+
+                const int numChannels = blobs[0].total();
+
+                Halide::Expr topExpr = input;
+                if (hasWeights)
+                {
+                    auto weights = wrapToHalideBuffer(blobs[0], { numChannels });
+                    topExpr *= weights(c);
+                }
+                if (hasBias)
+                {
+                    auto bias = wrapToHalideBuffer(blobs.back(), { numChannels });
+                    topExpr += bias(c);
+                }
+                top(x, y, c, n) = topExpr;
+                return top;
+            }
+#endif // HAVE_HALIDE
+
+
+#ifdef HAVE_DNN_NGRAPH
+            virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
+            {
+                auto ieInpNode0 = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
+                ov::Output<ov::Node> ieInpNode1;
+                if (nodes.size() > 1)
+                    ieInpNode1 = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
+
+                size_t numChannels = 1;
+                if (blobs.empty())
+                    for (const size_t& dim : ieInpNode1.get_shape())
+                        numChannels *= dim;
+                else
+                    numChannels = blobs[0].total();
+
+                std::vector<size_t> shape(ieInpNode0.get_shape().size(), 1);
+                int cAxis = normalize_axis(axis, shape.size());
+                shape[cAxis] = numChannels;
+
+                std::shared_ptr<ov::Node> node;
+                if (hasWeights)
+                {
+                    ov::Output<ov::Node> weight = blobs.empty() ? ieInpNode1 :
+                        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape(shape), blobs[0].data);
+                    node = std::make_shared<ov::op::v1::Multiply>(ieInpNode0, weight, ov::op::AutoBroadcastType::NUMPY);
+                }
+                if (hasBias || !hasWeights)
+                {
+                    ov::Output<ov::Node> bias;
+                    if (hasBias)
+                    {
+                        bias = blobs.empty() ? ieInpNode1 :
+                            std::make_shared<ov::op::v0::Constant>(ov::element::f32,
+                                ov::Shape(shape), blobs.back().data);
+                    }
+                    else
+                        bias = std::make_shared<ov::op::v0::Constant>(ov::element::f32,
+                            ov::Shape(shape), std::vector<float>(numChannels, 0).data());
+                    node = std::make_shared<ov::op::v1::Add>(node, bias, ov::op::AutoBroadcastType::NUMPY);
+                }
+                return Ptr<BackendNode>(new InfEngineNgraphNode(node));
+            }
+#endif // HAVE_DNN_NGRAPH
+
+#ifdef HAVE_WEBNN
+            virtual Ptr<BackendNode> initWebnn(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
+            {
+                Ptr<WebnnBackendNode> node = nodes[0].dynamicCast<WebnnBackendNode>();
+                auto& webnnInpOperand0 = node->operand;
+                auto& webnnGraphBuilder = node->net->builder;
+                auto webnnInpOperand1 = nodes.size() > 1 ? nodes[1].dynamicCast<WebnnBackendNode>()->operand : nullptr;
+                auto webnnInpOperand2 = nodes.size() > 2 ? nodes[1].dynamicCast<WebnnBackendNode>()->operand : nullptr;
+                std::vector<int32_t> shape(dims, 1);
+
+                size_t channels = 1;
+                if (blobs.empty())
+                    channels = numChannels;
+                else
+                    channels = blobs[0].total();
+
+                int cAxis = normalize_axis(axis, shape.size());
+                shape[cAxis] = channels;
+
+                ml::Operand operand = webnnInpOperand0;
+                if (hasWeights)
+                {
+                    ml::Operand webnnWeights = blobs.empty() ? webnnInpOperand1 : webnn::BuildConstant(webnnGraphBuilder, webnn::getShape(blobs[0]), blobs[0].data, blobs[0].total() * blobs[0].elemSize(), ml::OperandType::Float32);
+                    webnnWeights = webnnGraphBuilder.Reshape(webnnWeights, shape.data(), shape.size());
+                    operand = webnnGraphBuilder.Mul(operand, webnnWeights);
+                }
+                if (hasBias)
+                {
+                    ml::Operand webnnBias;
+                    if (!hasWeights)
+                        webnnBias = blobs.empty() ? webnnInpOperand1 : webnn::BuildConstant(webnnGraphBuilder, webnn::getShape(blobs.back()), blobs.back().data, blobs.back().total() * blobs.back().elemSize(), ml::OperandType::Float32);
+                    else
+                        webnnBias = blobs.empty() ? webnnInpOperand2 : webnn::BuildConstant(webnnGraphBuilder, webnn::getShape(blobs.back()), blobs.back().data, blobs.back().total() * blobs.back().elemSize(), ml::OperandType::Float32);
+                    webnnBias = webnnGraphBuilder.Reshape(webnnBias, shape.data(), shape.size());
+                    operand = webnnGraphBuilder.Add(operand, webnnBias);
+                }
+
+                return Ptr<BackendNode>(new WebnnBackendNode(operand));
+            }
+#endif
+
+
+            void getScaleShift(Mat& scale, Mat& shift) const CV_OVERRIDE
+            {
+                scale = Mat();
+                shift = Mat();
+            }
+
+            //bool tryQuantize(const std::vector<std::vector<float> >& scales,
+            //                 const std::vector<std::vector<int> >& zeropoints, LayerParams& params) CV_OVERRIDE
+            //{
+            //    params.set("input_scales", DictValue::arrayReal(scales[0].data(), scales[0].size()));
+            //    params.set("input_zeropoints", DictValue::arrayInt(zeropoints[0].data(), zeropoints[0].size()));
+            //    return true;
+            //}
+
+            virtual int64 getFLOPS(const std::vector<MatShape>& inputs,
+                                   const std::vector<MatShape>& outputs) const CV_OVERRIDE
+            {
+                CV_UNUSED(outputs); // suppress unused variable warning
+                long flops = 0;
+                for (int i = 0; i < inputs.size(); i++)
+                {
+                    flops += 3 * total(inputs[i]);
+                }
+                return flops;
+            }
+
+            static Ptr<Layer> create(const LayerParams& params)
+            {
+                return Ptr<Layer>(new AxpyFastLayerImpl(params));
+            }
+        };
+
+    } // namespace dnn
+} // namespace cv
+
+# include <opencv2/dnn/layer.details.hpp>
+
+void reg2()
+{
+    CV_DNN_REGISTER_LAYER_CLASS(AxpyFast, cv::dnn::AxpyFastLayerImpl);
+}
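As the TODO in forward() notes, the committed CPU path keeps the ScaleLayer contract (bias reshaped to match the weights), so CV_CheckEQ(weights.total(), bias.total(), ...) rejects genuine Axpy inputs, where the third bottom has the input blob's full shape. One possible shape of the rewritten inner loop under Axpy semantics -- hypothetical, reusing forward()'s variable names:

    // Hypothetical Axpy-correct slice loop: for each (n, c) plane,
    // out = weightsData[j] * x + y, where y (inputs[2]) has the same layout as x.
    const float* yData = inputs[2].ptr<float>();
    for (int i = 0; i < numSlices; ++i)
    {
        for (int j = 0; j < numWeights; ++j)
        {
            Mat inpSlice(1, spatialSize, CV_32F, inpData);
            Mat ySlice(1, spatialSize, CV_32F, (void*)yData);
            Mat outSlice(1, spatialSize, CV_32F, outData);

            inpSlice.convertTo(outSlice, CV_32F, weightsData[j], 0.0); // out = w * x
            cv::add(outSlice, ySlice, outSlice);                       // out += y

            inpData += spatialSize;
            yData += spatialSize;
            outData += spatialSize;
        }
    }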
diff --git a/waifu2x-caffe/waifu2x-caffe.vcxproj b/waifu2x-caffe/waifu2x-caffe.vcxproj
index 1c0eb7a..2efe32e 100644
--- a/waifu2x-caffe/waifu2x-caffe.vcxproj
+++ b/waifu2x-caffe/waifu2x-caffe.vcxproj
@@ -104,6 +104,7 @@
     true
     true
+    true
 
@@ -115,6 +116,7 @@
 
+
 
diff --git a/waifu2x-caffe/waifu2x-caffe.vcxproj.filters b/waifu2x-caffe/waifu2x-caffe.vcxproj.filters
index a2a6820..5407249 100644
--- a/waifu2x-caffe/waifu2x-caffe.vcxproj.filters
+++ b/waifu2x-caffe/waifu2x-caffe.vcxproj.filters
@@ -36,6 +36,9 @@
       ソース ファイル
 
+
+      ソース ファイル
+
 
@@ -47,5 +50,8 @@
       common
 
+
+      ソース ファイル
+
 
\ No newline at end of file
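The project-file changes add axpy.hpp and axpy_fast_layer.cpp to the build; the layer itself only resolves if reg2() runs before the model is parsed, since CV_DNN_REGISTER_LAYER_CLASS registers the factory under the exact type string used in the prototxt. A minimal usage sketch mirroring Test.cpp (readNetFromCaffe assumed as the loader):

    #include <opencv2/dnn.hpp>

    void reg();  // assumed to register the project's existing CropCenter layer
    void reg2(); // registers AxpyFast (defined in axpy_fast_layer.cpp)

    int main()
    {
        reg();
        reg2(); // must run before readNetFromCaffe, or "AxpyFast" is an unknown layer type
        cv::dnn::Net net = cv::dnn::readNetFromCaffe(
            "models/upresnet10/noise0_scale2.0x_model.prototxt",
            "models/upresnet10/noise0_scale2.0x_model.json.caffemodel");
        return 0;
    }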