gemmlowp/internal/single_thread_gemm.h

// Copyright 2015 The Gemmlowp Authors. All Rights Reserved.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.
*5f39d1b3SJooyung Han
// single_thread_gemm.h: Single-threaded GEMM implementation.
// This is a good place to start reading code, as it shows the overall
// structure of a GEMM and is much simpler than multi_thread_gemm.h.
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han#ifndef GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_
*5f39d1b3SJooyung Han#define GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_
*5f39d1b3SJooyung Han
#include <algorithm>
#include <cassert>

#include "../public/map.h"
#include "allocator.h"
#include "compute.h"
#include "kernel.h"
#include "pack.h"
#include "unpack.h"

#ifdef GEMMLOWP_PROFILING_SIZES
#ifndef GEMMLOWP_PROFILING
#error GEMMLOWP_PROFILING_SIZES without GEMMLOWP_PROFILING
#endif
#include <cstdint>
#include <cstdio>
#include <string>
#include <unordered_map>
#endif
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Hannamespace gemmlowp {
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Hanclass SingleThreadGemmContext {
*5f39d1b3SJooyung Han public:
*5f39d1b3SJooyung Han  Allocator* allocator() { return &allocator_; }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  void set_l1_bytes_to_use(int n) { l1_bytes_to_use_ = n; }
*5f39d1b3SJooyung Han  void set_l2_bytes_to_use(int n) { l2_bytes_to_use_ = n; }
*5f39d1b3SJooyung Han  void set_l2_rhs_factor(float n) { l2_rhs_factor_ = n; }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  int l1_bytes_to_use() const { return l1_bytes_to_use_; }
*5f39d1b3SJooyung Han  int l2_bytes_to_use() const { return l2_bytes_to_use_; }
*5f39d1b3SJooyung Han  float l2_rhs_factor() const { return l2_rhs_factor_; }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han protected:
*5f39d1b3SJooyung Han  Allocator allocator_;
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  // The cache configurationt to use.
*5f39d1b3SJooyung Han  int l1_bytes_to_use_ = kDefaultL1CacheSize;
*5f39d1b3SJooyung Han  int l2_bytes_to_use_ = kDefaultL2CacheSize;
*5f39d1b3SJooyung Han  float l2_rhs_factor_ = kDefaultL2RhsFactor;
*5f39d1b3SJooyung Han};
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Hantemplate <typename KernelFormat, typename InputScalar, typename OutputScalar,
*5f39d1b3SJooyung Han          typename BitDepthParams, MapOrder LhsOrder, MapOrder RhsOrder,
*5f39d1b3SJooyung Han          MapOrder ResultOrder, typename LhsOffset, typename RhsOffset,
*5f39d1b3SJooyung Han          typename OutputPipelineType>
*5f39d1b3SJooyung Hanvoid SingleThreadGemm(SingleThreadGemmContext* context,
*5f39d1b3SJooyung Han                      const KernelBase& kernel,
*5f39d1b3SJooyung Han                      const MatrixMap<const InputScalar, LhsOrder>& lhs,
*5f39d1b3SJooyung Han                      const MatrixMap<const InputScalar, RhsOrder>& rhs,
*5f39d1b3SJooyung Han                      MatrixMap<OutputScalar, ResultOrder>* result,
*5f39d1b3SJooyung Han                      const LhsOffset& lhs_offset, const RhsOffset& rhs_offset,
*5f39d1b3SJooyung Han                      const OutputPipelineType& output_pipeline) {
*5f39d1b3SJooyung Han  ScopedProfilingLabel label("gemmlowp::SingleThreadGemm");
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  assert(lhs.cols() == rhs.rows());
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  int rows = result->rows();
*5f39d1b3SJooyung Han  int cols = result->cols();
*5f39d1b3SJooyung Han  int depth = lhs.cols();
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  // zero sizes should have been caught earlier and early-returned.
*5f39d1b3SJooyung Han  assert(rows > 0);
*5f39d1b3SJooyung Han  assert(cols > 0);
*5f39d1b3SJooyung Han  assert(depth > 0);
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  // The case of rows<cols should have been caught earlier and transposed.
*5f39d1b3SJooyung Han  assert(rows >= cols);
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  Allocator* allocator = context->allocator();
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  BlockParams block_params;
*5f39d1b3SJooyung Han  block_params.Init<KernelFormat>(
*5f39d1b3SJooyung Han      rows, cols, depth, 1, context->l1_bytes_to_use(),
*5f39d1b3SJooyung Han      context->l2_bytes_to_use(), context->l2_rhs_factor());
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han#ifdef GEMMLOWP_PROFILING_SIZES
*5f39d1b3SJooyung Han  // Using a static map of label strings. Not reentrant at all!
*5f39d1b3SJooyung Han  static std::unordered_map<std::uint64_t, std::string> labels_map;
*5f39d1b3SJooyung Han  std::uint64_t sizes_hash = static_cast<std::uint64_t>(rows) ^
*5f39d1b3SJooyung Han                             (static_cast<std::uint64_t>(depth) << 16) ^
*5f39d1b3SJooyung Han                             (static_cast<std::uint64_t>(cols) << 32);
*5f39d1b3SJooyung Han  if (!labels_map.count(sizes_hash)) {
*5f39d1b3SJooyung Han    char label[256];
*5f39d1b3SJooyung Han    snprintf(label, sizeof(label),
*5f39d1b3SJooyung Han             "(rows = %d, depth = %d, cols = %d, l2_rows = %d, l2_depth = %d, "
*5f39d1b3SJooyung Han             "l2_cols = %d, l1_rows = %d, l1_depth = %d, l1_cols = %d)",
*5f39d1b3SJooyung Han             rows, depth, cols, block_params.l2_rows, block_params.l2_depth,
*5f39d1b3SJooyung Han             block_params.l2_cols, block_params.l1_rows, block_params.l1_depth,
*5f39d1b3SJooyung Han             block_params.l1_cols);
*5f39d1b3SJooyung Han    labels_map[sizes_hash] = label;
*5f39d1b3SJooyung Han  }
*5f39d1b3SJooyung Han  ScopedProfilingLabel size_label(labels_map[sizes_hash].c_str());
*5f39d1b3SJooyung Han#endif
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  PackedSideBlock<typename KernelFormat::Lhs> packed_lhs(Side::Lhs, allocator,
*5f39d1b3SJooyung Han                                                         block_params);
*5f39d1b3SJooyung Han  PackedSideBlock<typename KernelFormat::Rhs> packed_rhs(Side::Rhs, allocator,
*5f39d1b3SJooyung Han                                                         block_params);
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  PackedResult packed_result(allocator, block_params);
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  allocator->Commit();
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  const bool pack_rhs_once = block_params.l2_cols >= cols;
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  if (pack_rhs_once) {
*5f39d1b3SJooyung Han    PackRhs(&packed_rhs, rhs);
*5f39d1b3SJooyung Han  }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  for (int r = 0; r < rows; r += block_params.l2_rows) {
*5f39d1b3SJooyung Han    int rs = std::min(block_params.l2_rows, rows - r);
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han    PackLhs(&packed_lhs, lhs.block(r, 0, rs, depth));
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han    for (int c = 0; c < cols; c += block_params.l2_cols) {
*5f39d1b3SJooyung Han      int cs = std::min(block_params.l2_cols, cols - c);
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han      if (!pack_rhs_once) {
*5f39d1b3SJooyung Han        PackRhs(&packed_rhs, rhs.block(0, c, depth, cs));
*5f39d1b3SJooyung Han      }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han      Compute(kernel, block_params, &packed_result, packed_lhs, packed_rhs,
*5f39d1b3SJooyung Han              depth);
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han      UnpackResult<KernelFormat>(
*5f39d1b3SJooyung Han          result, MatrixBlockBounds(r, c, rs, cs), packed_result, depth,
*5f39d1b3SJooyung Han          packed_lhs.sums_of_each_slice(), packed_rhs.sums_of_each_slice(),
*5f39d1b3SJooyung Han          lhs_offset.block(r, rs), rhs_offset.block(c, cs), output_pipeline);
*5f39d1b3SJooyung Han    }
*5f39d1b3SJooyung Han  }
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han  allocator->Decommit();
*5f39d1b3SJooyung Han}
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han}  // namespace gemmlowp
*5f39d1b3SJooyung Han
*5f39d1b3SJooyung Han#endif  // GEMMLOWP_INTERNAL_SINGLE_THREAD_GEMM_H_