diff --git a/bin/models/upresnet10/noise0_scale2.0x_model.prototxt b/bin/models/upresnet10/noise0_scale2.0x_model.prototxt
index c083cc7..9def1bf 100644
--- a/bin/models/upresnet10/noise0_scale2.0x_model.prototxt
+++ b/bin/models/upresnet10/noise0_scale2.0x_model.prototxt
@@ -196,7 +196,7 @@ layer {
 }
 layer {
   name: "/res1/axpy"
-  type: "Axpy"
+  type: "AxpyFast"
   bottom: "/res1/fc2_sigmoid"
   bottom: "/res1/conv2_relu"
   bottom: "/res1/crop"
@@ -353,7 +353,7 @@ layer {
 }
 layer {
   name: "/res2/axpy"
-  type: "Axpy"
+  type: "AxpyFast"
   bottom: "/res2/fc2_sigmoid"
   bottom: "/res2/conv2_relu"
   bottom: "/res2/crop"
@@ -510,7 +510,7 @@ layer {
 }
 layer {
   name: "/res3/axpy"
-  type: "Axpy"
+  type: "AxpyFast"
   bottom: "/res3/fc2_sigmoid"
   bottom: "/res3/conv2_relu"
   bottom: "/res3/crop"
@@ -667,7 +667,7 @@ layer {
 }
 layer {
   name: "/res4/axpy"
-  type: "Axpy"
+  type: "AxpyFast"
   bottom: "/res4/fc2_sigmoid"
   bottom: "/res4/conv2_relu"
   bottom: "/res4/crop"
@@ -824,7 +824,7 @@ layer {
 }
 layer {
   name: "/res5/axpy"
-  type: "Axpy"
+  type: "AxpyFast"
   bottom: "/res5/fc2_sigmoid"
   bottom: "/res5/conv2_relu"
   bottom: "/res5/crop"
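For context: the Caffe Axpy layer used by these residual blocks computes F = a * X + Y, where a (first bottom, fc2_sigmoid) is an N x C (x 1 x 1) per-channel scale, X (conv2_relu) is the N x C x H x W feature map, and Y (crop) is a residual of the same shape as X. Renaming the type to AxpyFast only changes which registered layer factory OpenCV dispatches to; the expected math is unchanged. A minimal CPU reference of that contract -- an illustrative sketch, not part of the patch -- assuming contiguous CV_32F NCHW blobs:

    #include <opencv2/core.hpp>

    // Reference Axpy: out = a (per-channel) * x + y.
    // a: N x C (x 1 x 1), x and y: N x C x H x W, all CV_32F and contiguous.
    static cv::Mat axpyReference(const cv::Mat& a, const cv::Mat& x, const cv::Mat& y)
    {
        CV_Assert(x.dims == 4 && x.size == y.size &&
                  a.size[0] == x.size[0] && a.size[1] == x.size[1]);
        cv::Mat out(x.dims, x.size.p, CV_32F);
        const size_t plane = (size_t)x.size[2] * x.size[3]; // H * W
        for (int n = 0; n < x.size[0]; ++n)
            for (int c = 0; c < x.size[1]; ++c)
            {
                const float scale = a.ptr<float>(n, c)[0];
                const float* px = x.ptr<float>(n, c);
                const float* py = y.ptr<float>(n, c);
                float* po = out.ptr<float>(n, c);
                for (size_t i = 0; i < plane; ++i)
                    po[i] = scale * px[i] + py[i];
            }
        return out;
    }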
diff --git a/waifu2x-caffe/Test.cpp b/waifu2x-caffe/Test.cpp
index ac559cc..1d07495 100644
--- a/waifu2x-caffe/Test.cpp
+++ b/waifu2x-caffe/Test.cpp
@@ -1,5 +1,6 @@
 # include
 # include
+# include <iomanip>
 # include
 # include
 # include
@@ -280,18 +281,98 @@ public:
     }
 };
 
+static double sumAllElements(const cv::Mat& mat)
+{
+    CV_Assert(!mat.empty());
+
+    const cv::Scalar s = cv::sum(mat); // per-channel sums
+    double total = 0.0;
+    for (int c = 0; c < mat.channels(); ++c) {
+        total += s[c];
+    }
+    return total;
+}
+
+
+// ---- internal implementation ----
+template <typename T>
+static void printRec(const cv::Mat& m, std::vector<int>& idx, int d) {
+    if (d == m.dims - 1) {
+        // last axis: print the one-dimensional run of elements
+        const int cn = m.channels();
+        std::cout << "[";
+        for (int i = 0; i < m.size[d]; ++i) {
+            idx[d] = i;
+            const T* p = m.ptr<T>(idx.data()); // pointer to the element at idx (first channel, ch=0)
+            if (cn == 1) {
+                std::cout << +p[0]; // unary + promotes uchar/schar so they print as numbers
+            }
+            else {
+                std::cout << "(";
+                for (int c = 0; c < cn; ++c) {
+                    std::cout << +p[c];
+                    if (c + 1 < cn) std::cout << ", ";
+                }
+                std::cout << ")";
+            }
+            if (i + 1 < m.size[d]) std::cout << ", ";
+        }
+        std::cout << "]";
+    }
+    else {
+        // intermediate axis: recurse one dimension deeper
+        std::cout << "[";
+        for (int i = 0; i < m.size[d]; ++i) {
+            idx[d] = i;
+            printRec<T>(m, idx, d + 1);
+            if (i + 1 < m.size[d]) std::cout << ",\n";
+        }
+        std::cout << "]";
+    }
+}
+
+template <typename T>
+static void printMatND_T(const cv::Mat& m) {
+    // keep the number of decimal places modest for floating-point types
+    if (std::is_floating_point<T>::value) {
+        std::cout << std::fixed << std::setprecision(6);
+    }
+    std::vector<int> idx(m.dims, 0);
+    printRec<T>(m, idx, 0);
+    std::cout << std::endl;
+}
+
+// entry point (dispatches on the cv::Mat depth)
+static void printMatND(const cv::Mat& m) {
+    switch (m.depth()) {
+    case CV_8U:  printMatND_T<uchar>(m);  break;
+    case CV_8S:  printMatND_T<schar>(m);  break;
+    case CV_16U: printMatND_T<ushort>(m); break;
+    case CV_16S: printMatND_T<short>(m);  break;
+    case CV_32S: printMatND_T<int>(m);    break;
+    case CV_32F: printMatND_T<float>(m);  break;
+    case CV_64F: printMatND_T<double>(m); break;
+    default:
+        throw std::runtime_error("Unsupported Mat depth.");
+    }
+}
+
+
 void reg();
+void reg2();
 
 int main(int argc, char** argv) {
     //CV_DNN_REGISTER_LAYER_CLASS(CropCenter, CropCenterLayer);
     reg();
+    reg2();
 
     // ImageNet Caffe reference model
     string protoFile = "models/upresnet10/noise0_scale2.0x_model.prototxt";
     string modelFile = "models/upresnet10/noise0_scale2.0x_model.json.caffemodel";
 
     // image file
-    string imageFile = (argc > 1) ? argv[1] : "images/cat.jpg";
+    //string imageFile = (argc > 1) ? argv[1] : "images/cat.jpg";
+    string imageFile = "red.png";
 
     // load the Caffe model
     cv::dnn::Net net;
@@ -321,25 +402,37 @@
     cv::resize(img, img, cv::Size(cropSize, cropSize));
     // convert to the blob format Caffe uses (in fact a wrapper class around cv::Mat)
     const auto inputBlob = cv::dnn::blobFromImage(img, 1.0 / 255.0, cv::Size(), cv::Scalar(), true, false, CV_32F);
+
+    //printMatND(inputBlob);
+    std::vector<int> indim(inputBlob.size.p, inputBlob.size.p + inputBlob.size.dims());
 
     // feed the image to the input layer
-    net.setInput(inputBlob);
+    net.setInput(inputBlob, "input");
     // run the forward pass and fetch the output layer's (Softmax) result; the prediction is stored here
     // (a 1x1000 matrix/vector of 32-bit float probabilities, one per ImageNet class)
-    const auto probMat = net.forward();
+    //const auto probMat = net.forward("/conv_post");
+    const auto probMat = net.forward("/res1/axpy");
+
+    std::vector<int> probMatDim(probMat.size.p, probMat.size.p + probMat.size.dims());
+    auto sss = sumAllElements(probMat);
+    //printMatND(probMat);
 
     std::vector<cv::Mat> outImgs;
     cv::dnn::imagesFromBlob(probMat, outImgs);
     //cv::dnn::imagesFromBlob(inputBlob, outImgs);
     auto outImg = outImgs[0];
 
+    std::vector<int> outdim(outImg.size.p, outImg.size.p + outImg.size.dims());
+    printMatND(outImg);
+
+    //std::cout << cv::format(outImg, cv::Formatter::FMT_DEFAULT) << std::endl;
+
     // clip values into the range 0-1
     cv::threshold(outImg, outImg, 1.0, 1.0, cv::THRESH_TRUNC);
     cv::threshold(outImg, outImg, 0.0, 0.0, cv::THRESH_TOZERO);
     const double clip_eps8 = (1.0 / 255.0) * 0.5 - (1.0e-7 * (1.0 / 255.0) * 0.5);
     outImg.convertTo(outImg, CV_8U, 255.0, clip_eps8);
-    std::vector<int> outdim(outImg.size.p, outImg.size.p + outImg.size.dims());
 
     cv::cvtColor(outImg, outImg, cv::COLOR_RGB2BGR);
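These Test.cpp edits turn the demo into a layer-debugging harness: the input is a local test image (red.png), the forward pass is cut short at /res1/axpy, and sumAllElements / printMatND give quick checksums of the blob. Because cv::dnn::Net::forward also accepts a list of output blob names, the same harness can fetch the layer's three bottoms together with its output and diff them against a naive reference in one pass -- a sketch to drop into main() after the net is loaded, assuming the top blob names match the prototxt layer names and reusing axpyReference from the sketch above:

    // Fetch the three bottoms of /res1/axpy plus its output in one pass,
    // then compare against the naive reference (axpyReference, above).
    std::vector<cv::Mat> outs;
    const std::vector<cv::String> names = {
        "/res1/fc2_sigmoid", "/res1/conv2_relu", "/res1/crop", "/res1/axpy" };
    net.forward(outs, names);
    const double maxAbsDiff =
        cv::norm(outs[3], axpyReference(outs[0], outs[1], outs[2]), cv::NORM_INF);
    std::cout << "max abs diff vs reference: " << maxAbsDiff << std::endl;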
diff --git a/waifu2x-caffe/axpy.hpp b/waifu2x-caffe/axpy.hpp
new file mode 100644
index 0000000..ec86d83
--- /dev/null
+++ b/waifu2x-caffe/axpy.hpp
@@ -0,0 +1,106 @@
+// This file is part of OpenCV project.
+// It is subject to the license terms in the LICENSE file found in the top-level directory
+// of this distribution and at http://opencv.org/license.html.
+
+#ifndef OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SCALE_SHIFT_HPP
+#define OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SCALE_SHIFT_HPP
+
+//#include "../../op_cuda.hpp"
+//
+//#include "../csl/stream.hpp"
+//#include "../csl/tensor.hpp"
+//
+//#include "../kernels/scale_shift.hpp"
+
+#include
+
+#include
+
+#include
+
+#include
+#include
+
+namespace cv {
+    namespace dnn {
+        namespace cuda4dnn {
+
+            template <class T>
+            class AxpyOp final : public CUDABackendNode {
+            public:
+                using wrapper_type = GetCUDABackendWrapperType<T>;
+
+                AxpyOp(csl::Stream stream_)
+                    : stream(std::move(stream_)), axis(0)
+                {
+                }
+
+                void forward(
+                    const std::vector<cv::Ptr<BackendWrapper>>& inputs,
+                    const std::vector<cv::Ptr<BackendWrapper>>& outputs,
+                    csl::Workspace& workspace) override
+                {
+                    CV_Assert(inputs.size() == 3);
+                    CV_Assert(outputs.size() == 1);
+
+                    auto input_wrapper = inputs[1].dynamicCast<wrapper_type>();
+                    auto input = input_wrapper->getView();
+
+                    auto output_wrapper = outputs[0].dynamicCast<wrapper_type>();
+                    auto output = output_wrapper->getSpan();
+
+                    /* number of batches in the weights/bias
+                     * trainable mode: same for all batches
+                     * untrainable mode: could be different for different batch samples
+                     */
+                    std::size_t parameter_batch_size = 1;
+
+                    csl::TensorView<T> weights = inputs[0].dynamicCast<wrapper_type>()->getView();
+                    parameter_batch_size = weights.get_axis_size(0);
+                    CV_Assert(parameter_batch_size == input.get_axis_size(0));
+
+                    csl::TensorView<T> bias = inputs[2].dynamicCast<wrapper_type>()->getView();
+                    parameter_batch_size = bias.get_axis_size(0);
+                    CV_Assert(parameter_batch_size == input.get_axis_size(0));
+
+                    CV_Assert(!weights.empty() || !bias.empty());
+                    if (!weights.empty() && !bias.empty())
+                    {
+                        CV_CheckEQ(weights.size(), bias.size(), "different broadcasting options for weights and bias is not supported");
+                    }
+
+                    const auto num_parameters = !weights.empty() ? weights.size() : bias.size();
+                    const auto mid_size = num_parameters / parameter_batch_size;
+
+                    /* the scale shift operation might require broadcasting */
+                    const int end_axis = [&] {
+                        if (num_parameters == 1) {
+                            return static_cast<int>(axis + 1);
+                        }
+                        for (int endAxis = axis + 1; endAxis <= input.rank(); endAxis++) {
+                            if (input.size_range(axis, endAxis) == mid_size)
+                                return endAxis;
+                        }
+                        CV_Assert(0 /* failed to find a broadcast config */);
+                    }();
+
+                    std::size_t inner_size = input.size_range(end_axis, input.rank());
+
+                    if (!weights.empty() && !bias.empty())
+                        kernels::scaleN_with_biasN<T>(stream, output, input, inner_size, weights, bias);
+                    else if (!weights.empty())
+                        kernels::scaleN<T>(stream, output, input, inner_size, weights);
+                    else
+                        kernels::biasN<T>(stream, output, input, inner_size, bias);
+                }
+
+            private:
+                csl::Stream stream;
+                std::size_t axis;
+            };
+
+        }
+    }
+} /* namespace cv::dnn::cuda4dnn */
+
+#endif /* OPENCV_DNN_SRC_CUDA4DNN_PRIMITIVES_SCALE_SHIFT_HPP */
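Note that axpy.hpp is OpenCV's cuda4dnn ScaleShift primitive with the class renamed to AxpyOp: it still checks weights.size() == bias.size(), which holds for a per-channel scale/bias pair but not for Axpy, whose third bottom is a full N x C x H x W blob (the CPU implementation below carries a TODO about the same mismatch). A hedged sketch of what an Axpy-correct forward body might look like, assuming the cuda4dnn scaleN and eltwise_sum_2 kernels are available under these names:

    // Hypothetical replacement for AxpyOp::forward (not the committed code):
    // out = a * x per channel, then out += y elementwise.
    auto a   = inputs[0].dynamicCast<wrapper_type>()->getView(); // N x C x 1 x 1
    auto x   = inputs[1].dynamicCast<wrapper_type>()->getView(); // N x C x H x W
    auto y   = inputs[2].dynamicCast<wrapper_type>()->getView(); // same shape as x
    auto out = outputs[0].dynamicCast<wrapper_type>()->getSpan();

    const std::size_t inner_size = x.size_range(2, x.rank());   // H * W
    kernels::scaleN<T>(stream, out, x, inner_size, a);          // out = a * x
    kernels::eltwise_sum_2<T>(stream, out, out, y);             // out += y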
diff --git a/waifu2x-caffe/axpy_fast_layer.cpp b/waifu2x-caffe/axpy_fast_layer.cpp
new file mode 100644
index 0000000..f2546e3
--- /dev/null
+++ b/waifu2x-caffe/axpy_fast_layer.cpp
@@ -0,0 +1,371 @@
+#include
+//#include
+#include
+
+//#include
+#include
+#include
+#include
+#include
+#include
+
+#include
+#include
+
+#include
+#include
+
+#ifdef HAVE_CUDA
+//#include
+#include "axpy.hpp"
+using namespace cv::dnn::cuda4dnn;
+#endif
+
+namespace cv
+{
+    namespace dnn
+    {
+
+        class AxpyFastLayerImpl CV_FINAL : public Layer
+        {
+        public:
+#ifdef HAVE_WEBNN
+            mutable int dims;
+            mutable int numChannels;
+#endif
+            AxpyFastLayerImpl(const LayerParams& params)
+            {
+                setParamsFrom(params);
+            }
+
+            bool getMemoryShapes(const std::vector<MatShape>& inputs,
+                                 const int requiredOutputs,
+                                 std::vector<MatShape>& outputs,
+                                 std::vector<MatShape>& internals) const CV_OVERRIDE
+            {
+                outputs.assign(1, inputs[1]);
+#ifdef HAVE_WEBNN
+                dims = inputs[0].size();
+                numChannels = 1;
+                if (inputs.size() > 1)
+                {
+                    for (const size_t& dim : inputs[1])
+                        numChannels *= dim;
+                }
+#endif
+                return true;
+            }
+
+            virtual void finalize(InputArrayOfArrays inputs_arr, OutputArrayOfArrays) CV_OVERRIDE
+            {
+                std::vector<Mat> inputs;
+                inputs_arr.getMatVector(inputs);
+                CV_Assert(inputs.size() == 3);
+            }
+
+            virtual bool supportBackend(int backendId) CV_OVERRIDE
+            {
+#ifdef HAVE_INF_ENGINE
+                if (backendId == DNN_BACKEND_INFERENCE_ENGINE_NGRAPH)
+                    return true;
+#endif
+                return backendId == DNN_BACKEND_OPENCV ||
+                       backendId == DNN_BACKEND_CUDA ||
+                       backendId == DNN_BACKEND_HALIDE ||
+                       backendId == DNN_BACKEND_WEBNN;
+            }
+
+            void forward(InputArrayOfArrays inputs_arr, OutputArrayOfArrays outputs_arr, OutputArrayOfArrays internals_arr) CV_OVERRIDE
+            {
+                CV_TRACE_FUNCTION();
+                CV_TRACE_ARG_VALUE(name, "name", name.c_str());
+
+                if (inputs_arr.depth() == CV_16F)
+                {
+                    forward_fallback(inputs_arr, outputs_arr, internals_arr);
+                    return;
+                }
+
+                std::vector<Mat> inputs, outputs;
+                inputs_arr.getMatVector(inputs);
+                outputs_arr.getMatVector(outputs);
+
+                CV_Assert_N(outputs.size() == 1, inputs.size() == 3);
+
+                Mat& inpBlob = inputs[1];
+                Mat& outBlob = outputs[0];
+                // There is a mode when we multiply a first blob by a second one
+                // instead of trainable weights.
+                Mat weights = inputs[0].reshape(1, 1);
+                Mat bias = inputs[2].reshape(1, 1);
+
+                MatShape inpShape0 = shape(inputs[0]);
+                MatShape inpShape1 = shape(inputs[1]);
+                MatShape inpShape2 = shape(inputs[2]);
+
+                // TODO: rewrite the computation; the bias the Caffe layer expects has a
+                // different shape from the one assumed here.
+                // Assumed here:               weights.shape == bias.shape
+                // Assumed by the Caffe layer: inpBlob.shape == bias.shape
+
+                MatShape inpShape = shape(inpBlob);
+                const int numWeights = weights.total();
+                CV_Assert(numWeights != 0);
+                CV_CheckEQ(weights.total(), bias.total(), "Incompatible weights/bias blobs");
+
+                if (weights.total() == 1)
+                {
+                    // The total() of bias should be same as weights.
+                    inpBlob.convertTo(outBlob, CV_32F, weights.at<float>(0), bias.at<float>(0));
+                    return;
+                }
+
+                int endAxis;
+                for (endAxis = 1; endAxis <= inpBlob.dims; ++endAxis)
+                {
+                    if (total(inpShape, 0, endAxis) == numWeights)
+                        break;
+                }
+                CV_Assert(total(inpShape, 0, endAxis) == numWeights);
+                CV_Assert(numWeights == bias.total());
+                CV_CheckTypeEQ(inpBlob.type(), CV_32FC1, ""); CV_CheckTypeEQ(outBlob.type(), CV_32FC1, "");
+
+                int numSlices = total(inpShape, 0, 0);
+                float* inpData = (float*)inpBlob.data;
+                float* outData = (float*)outBlob.data;
+
+                if (endAxis != inpBlob.dims)
+                {
+                    float* weightsData = (float*)weights.data;
+                    float* biasesData = (float*)bias.data;
+                    int spatialSize = total(inpShape, endAxis); // spatialSize != 1
+                    for (int i = 0; i < numSlices; ++i)
+                    {
+                        for (int j = 0; j < numWeights; ++j)
+                        {
+                            float w = weightsData ? weightsData[j] : 1;
+                            float b = biasesData ? biasesData[j] : 0;
+                            Mat inpSlice(1, spatialSize, CV_32F, inpData);
+                            Mat outSlice(1, spatialSize, CV_32F, outData);
+
+                            inpSlice.convertTo(outSlice, CV_32F, w, b);
+
+                            inpData += spatialSize;
+                            outData += spatialSize;
+                        }
+                    }
+                }
+                else
+                {
+                    for (int i = 0; i < numSlices; ++i)
+                    {
+                        Mat inpSlice(1, numWeights, CV_32F, inpData);
+                        Mat outSlice(1, numWeights, CV_32F, outData);
+
+                        multiply(inpSlice, weights, outSlice);
+                        add(outSlice, bias, outSlice);
+
+                        inpData += numWeights;
+                        outData += numWeights;
+                    }
+                }
+            }
+
+#ifdef HAVE_CUDA
+            Ptr<BackendNode> initCUDA(
+                void* context_,
+                const std::vector<Ptr<BackendWrapper>>& inputs,
+                const std::vector<Ptr<BackendWrapper>>& outputs
+            ) override
+            {
+                auto context = reinterpret_cast<csl::CSLContext*>(context_);
+
+                CV_Assert(inputs.size() == 3);
+
+                return make_cuda_node<AxpyOp>(preferableTarget, std::move(context->stream));
+            }
+#endif
+
+            virtual Ptr<BackendNode> tryAttach(const Ptr<BackendNode>& node) CV_OVERRIDE
+            {
+                switch (node->backendId)
+                {
+                case DNN_BACKEND_HALIDE:
+                {
+#ifdef HAVE_HALIDE
+                    auto base = node.dynamicCast<HalideBackendNode>();
+                    Halide::Func& input = base->funcs.back();
+                    Halide::Var x("x"), y("y"), c("c"), n("n");
+                    Halide::Func top = attachHalide(input(x, y, c, n));
+                    return Ptr<BackendNode>(new HalideBackendNode(base, top));
+#endif // HAVE_HALIDE
+                    break;
+                }
+                }
+                return Ptr<BackendNode>();
+            }
+
+            virtual Ptr<BackendNode> initHalide(const std::vector<Ptr<BackendWrapper> >& inputs) CV_OVERRIDE
+            {
+#ifdef HAVE_HALIDE
+                Halide::Buffer<float> input = halideBuffer(inputs[0]);
+                Halide::Var x("x"), y("y"), c("c"), n("n");
+                Halide::Func top = attachHalide(input(x, y, c, n));
+                return Ptr<BackendNode>(new HalideBackendNode(top));
+#endif // HAVE_HALIDE
+                return Ptr<BackendNode>();
+            }
+
+#ifdef HAVE_HALIDE
+            // attachHalide can work both with Halide::Buffer and Halide::Func. In the
+            // second case it will be a fusion.
+            Halide::Func attachHalide(const Halide::Expr& input)
+            {
+                Halide::Func top = (name.empty() ? Halide::Func() : Halide::Func(name));
+                Halide::Var x("x"), y("y"), c("c"), n("n");
+
+                const int numChannels = blobs[0].total();
+
+                Halide::Expr topExpr = input;
+                if (hasWeights)
+                {
+                    auto weights = wrapToHalideBuffer(blobs[0], { numChannels });
+                    topExpr *= weights(c);
+                }
+                if (hasBias)
+                {
+                    auto bias = wrapToHalideBuffer(blobs.back(), { numChannels });
+                    topExpr += bias(c);
+                }
+                top(x, y, c, n) = topExpr;
+                return top;
+            }
+#endif // HAVE_HALIDE
+
+
+#ifdef HAVE_DNN_NGRAPH
+            virtual Ptr<BackendNode> initNgraph(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
+            {
+                auto ieInpNode0 = nodes[0].dynamicCast<InfEngineNgraphNode>()->node;
+                ov::Output<ov::Node> ieInpNode1;
+                if (nodes.size() > 1)
+                    ieInpNode1 = nodes[1].dynamicCast<InfEngineNgraphNode>()->node;
+
+                size_t numChannels = 1;
+                if (blobs.empty())
+                    for (const size_t& dim : ieInpNode1.get_shape())
+                        numChannels *= dim;
+                else
+                    numChannels = blobs[0].total();
+
+                std::vector<size_t> shape(ieInpNode0.get_shape().size(), 1);
+                int cAxis = normalize_axis(axis, shape.size());
+                shape[cAxis] = numChannels;
+
+                std::shared_ptr<ov::Node> node;
+                if (hasWeights)
+                {
+                    ov::Output<ov::Node> weight = blobs.empty() ? ieInpNode1 :
+                        std::make_shared<ov::op::v0::Constant>(ov::element::f32, ov::Shape(shape), blobs[0].data);
+                    node = std::make_shared<ov::op::v1::Multiply>(ieInpNode0, weight, ov::op::AutoBroadcastType::NUMPY);
+                }
+                if (hasBias || !hasWeights)
+                {
+                    ov::Output<ov::Node> bias;
+                    if (hasBias)
+                    {
+                        bias = blobs.empty() ? ieInpNode1 :
+                            std::make_shared<ov::op::v0::Constant>(ov::element::f32,
+                                ov::Shape(shape), blobs.back().data);
+                    }
+                    else
+                        bias = std::make_shared<ov::op::v0::Constant>(ov::element::f32,
+                            ov::Shape(shape), std::vector<float>(numChannels, 0).data());
+                    node = std::make_shared<ov::op::v1::Add>(node, bias, ov::op::AutoBroadcastType::NUMPY);
+                }
+                return Ptr<BackendNode>(new InfEngineNgraphNode(node));
+            }
+#endif // HAVE_DNN_NGRAPH
+
+#ifdef HAVE_WEBNN
+            virtual Ptr<BackendNode> initWebnn(const std::vector<Ptr<BackendWrapper> >& inputs, const std::vector<Ptr<BackendNode> >& nodes) CV_OVERRIDE
+            {
+                Ptr<WebnnBackendNode> node = nodes[0].dynamicCast<WebnnBackendNode>();
+                auto& webnnInpOperand0 = node->operand;
+                auto& webnnGraphBuilder = node->net->builder;
+                auto webnnInpOperand1 = nodes.size() > 1 ? nodes[1].dynamicCast<WebnnBackendNode>()->operand : nullptr;
+                auto webnnInpOperand2 = nodes.size() > 2 ? nodes[1].dynamicCast<WebnnBackendNode>()->operand : nullptr;
+                std::vector<int32_t> shape(dims, 1);
+
+                size_t channels = 1;
+                if (blobs.empty())
+                    channels = numChannels;
+                else
+                    channels = blobs[0].total();
+
+                int cAxis = normalize_axis(axis, shape.size());
+                shape[cAxis] = channels;
+
+                ml::Operand operand = webnnInpOperand0;
+                if (hasWeights)
+                {
+                    ml::Operand webnnWeights = blobs.empty() ? webnnInpOperand1 : webnn::BuildConstant(webnnGraphBuilder, webnn::getShape(blobs[0]), blobs[0].data, blobs[0].total() * blobs[0].elemSize(), ml::OperandType::Float32);
+                    webnnWeights = webnnGraphBuilder.Reshape(webnnWeights, shape.data(), shape.size());
+                    operand = webnnGraphBuilder.Mul(operand, webnnWeights);
+                }
+                if (hasBias)
+                {
+                    ml::Operand webnnBias;
+                    if (!hasWeights)
+                        webnnBias = blobs.empty() ? webnnInpOperand1 : webnn::BuildConstant(webnnGraphBuilder, webnn::getShape(blobs.back()), blobs.back().data, blobs.back().total() * blobs.back().elemSize(), ml::OperandType::Float32);
+                    else
+                        webnnBias = blobs.empty() ? webnnInpOperand2 : webnn::BuildConstant(webnnGraphBuilder, webnn::getShape(blobs.back()), blobs.back().data, blobs.back().total() * blobs.back().elemSize(), ml::OperandType::Float32);
+                    webnnBias = webnnGraphBuilder.Reshape(webnnBias, shape.data(), shape.size());
+                    operand = webnnGraphBuilder.Add(operand, webnnBias);
+                }
+
+                return Ptr<BackendNode>(new WebnnBackendNode(operand));
+            }
+#endif
+
+
+            void getScaleShift(Mat& scale, Mat& shift) const CV_OVERRIDE
+            {
+                scale = Mat();
+                shift = Mat();
+            }
+
+            //bool tryQuantize(const std::vector<std::vector<float> >& scales,
+            //                 const std::vector<std::vector<int> >& zeropoints, LayerParams& params) CV_OVERRIDE
+            //{
+            //    params.set("input_scales", DictValue::arrayReal(scales[0].data(), scales[0].size()));
+            //    params.set("input_zeropoints", DictValue::arrayInt(zeropoints[0].data(), zeropoints[0].size()));
+            //    return true;
+            //}
+
+            virtual int64 getFLOPS(const std::vector<MatShape>& inputs,
+                                   const std::vector<MatShape>& outputs) const CV_OVERRIDE
+            {
+                CV_UNUSED(outputs); // suppress unused variable warning
+                long flops = 0;
+                for (int i = 0; i < inputs.size(); i++)
+                {
+                    flops += 3 * total(inputs[i]);
+                }
+                return flops;
+            }
+
+            static Ptr<Layer> create(const LayerParams& params)
+            {
+                return Ptr<Layer>(new AxpyFastLayerImpl(params));
+            }
+        };
+
+    } // namespace dnn
+} // namespace cv
+
+# include <opencv2/dnn/layer.details.hpp>
+
+void reg2()
+{
+    CV_DNN_REGISTER_LAYER_CLASS(AxpyFast, cv::dnn::AxpyFastLayerImpl);
+}
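As the TODO in forward() notes, the committed CPU path keeps the ScaleLayer contract (bias reshaped to match the weights), so CV_CheckEQ(weights.total(), bias.total(), ...) rejects genuine Axpy inputs, where the third bottom has the input blob's full shape. One possible shape of the rewritten inner loop under Axpy semantics -- hypothetical, reusing forward()'s variable names:

    // Hypothetical Axpy-correct slice loop: for each (n, c) plane,
    // out = weightsData[j] * x + y, where y (inputs[2]) has the same layout as x.
    const float* yData = inputs[2].ptr<float>();
    for (int i = 0; i < numSlices; ++i)
    {
        for (int j = 0; j < numWeights; ++j)
        {
            Mat inpSlice(1, spatialSize, CV_32F, inpData);
            Mat ySlice(1, spatialSize, CV_32F, (void*)yData);
            Mat outSlice(1, spatialSize, CV_32F, outData);

            inpSlice.convertTo(outSlice, CV_32F, weightsData[j], 0.0); // out = w * x
            cv::add(outSlice, ySlice, outSlice);                       // out += y

            inpData += spatialSize;
            yData += spatialSize;
            outData += spatialSize;
        }
    }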
diff --git a/waifu2x-caffe/waifu2x-caffe.vcxproj b/waifu2x-caffe/waifu2x-caffe.vcxproj
index 1c0eb7a..2efe32e 100644
--- a/waifu2x-caffe/waifu2x-caffe.vcxproj
+++ b/waifu2x-caffe/waifu2x-caffe.vcxproj
@@ -104,6 +104,7 @@
     true
     true
+    true
 
@@ -115,6 +116,7 @@
 
+
 
diff --git a/waifu2x-caffe/waifu2x-caffe.vcxproj.filters b/waifu2x-caffe/waifu2x-caffe.vcxproj.filters
index a2a6820..5407249 100644
--- a/waifu2x-caffe/waifu2x-caffe.vcxproj.filters
+++ b/waifu2x-caffe/waifu2x-caffe.vcxproj.filters
@@ -36,6 +36,9 @@
       ソース ファイル
 
+
+      ソース ファイル
+
 
@@ -47,5 +50,8 @@
       common
 
+
+      ソース ファイル
+
 
\ No newline at end of file
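The project-file changes add axpy.hpp and axpy_fast_layer.cpp to the build; the layer itself only resolves if reg2() runs before the model is parsed, since CV_DNN_REGISTER_LAYER_CLASS registers the factory under the exact type string used in the prototxt. A minimal usage sketch mirroring Test.cpp (readNetFromCaffe assumed as the loader):

    #include <opencv2/dnn.hpp>

    void reg();  // assumed to register the project's existing CropCenter layer
    void reg2(); // registers AxpyFast (defined in axpy_fast_layer.cpp)

    int main()
    {
        reg();
        reg2(); // must run before readNetFromCaffe, or "AxpyFast" is an unknown layer type
        cv::dnn::Net net = cv::dnn::readNetFromCaffe(
            "models/upresnet10/noise0_scale2.0x_model.prototxt",
            "models/upresnet10/noise0_scale2.0x_model.json.caffemodel");
        return 0;
    }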