#pragma once

#include <c10/util/Logging.h>
#include <torch/csrc/distributed/c10d/reducer.hpp>

#include <atomic>
#include <map>
#include <utility>

namespace c10d {

class TORCH_API Logger {
 public:
  explicit Logger(std::shared_ptr<c10d::Reducer> reducer);
  // Set logging data that can be retrieved during
  // DistributedDataParallel construction time.
  void set_construction_data_and_log(
      const std::string& module_name,
      const std::vector<int>& device_ids,
      int output_device,
      bool broadcast_buffers,
      bool has_sync_bn,
      bool static_graph);

  void set_static_graph();

  // An interface for users to get DDPLoggingData and log them
  // in their applications. Explanations of the logging fields are in
  // "struct DDPLoggingData" of "c10/util/Logging.h".
  at::DDPLoggingData get_ddp_logging_data();

  // Stream insertion operator for logging data to a stream under
  // TORCH_DISTRIBUTED_DEBUG.
  friend std::ostream& operator<<(std::ostream& output, const Logger& logger);

  ~Logger() noexcept(false) {
    // Log whether the DDP graph is static in the Logger dtor instead of the
    // Reducer dtor, since the Logger is deleted before the Reducer.
    log_if_graph_static(reducer_->ddp_graph_static());
  }

  // Set environment variables.
  void set_env_variables();
  // Set parameter stats.
  void set_parameter_stats();
  // Get the size of each bucket (bytes).
  std::vector<int64_t> get_bucket_sizes();
  // Get variable indices for each bucket.
  std::vector<std::vector<size_t>> get_per_bucket_variable_indices();
  // Set the communication hook, if one is used.
  void set_comm_hook(const std::string& hook);
  // Set running with uneven input detection (model.join() context manager).
  void set_uneven_input_join();

  // Reset performance stats at the current iteration.
  void reset_performance_stats();

  // Calculate average stats using the CPU and GPU timers
  // recorded in the reducer.
  void calculate_avg_time(
      int64_t& avg_time,
      int64_t& time_duration,
      Timer& timer,
      Timer::Event start_event,
      Timer::Event end_event);

  // Set the absolute time of the event that has been recorded in the reducer.
  void set_event_time(int64_t& event_time, Timer& timer, Timer::Event event);
  // Set stats that can only be collected during the training loop. It is
  // called at the beginning of the forward call to record the runtime stats
  // of sampled iterations that previously ran.
  // GPU performance stats are currently collected only for single-process
  // single-device programs with single-device modules.
  // TODO: to support single-process multi-device and multi-device modules,
  // events need to be created and recorded on multiple devices.
  void set_runtime_stats_and_log();

  // Called when DDP/reducer fails with an error. Two fields of the logging
  // data structure will be filled: "has_error", indicating that this
  // iteration encountered an error (and that other fields are not valid),
  // and "error", a string containing the error message that DDP failed with.
  template <typename... Args>
  void set_error_and_log(const std::string& ddp_error, const Args&... args) {
    ddp_logging_data_->ints_map["has_error"] = 1;
    auto err = c10::str(ddp_error, args...);
    ddp_logging_data_->strs_map["error"] = err;
    // Report the iteration at which we are erroring out so the user knows
    // how many examples were successfully processed before this error was
    // hit.
    ddp_logging_data_->ints_map["iteration"] = reducer_->num_iterations_;
    at::LogPyTorchDDPUsage(*ddp_logging_data_);
  }

  // When running without static graph, called when the reducer is destroyed
  // to log whether the graph was actually static and is a candidate for the
  // static-graph optimization.
  void log_if_graph_static(bool is_static);

 private:
  // ddp_logging_data_ holds all the DDP-related logging data fields.
  std::unique_ptr<at::DDPLoggingData> ddp_logging_data_;
  std::shared_ptr<c10d::Reducer> reducer_;
  // Tracks the number of iterations for which runtime stats have been
  // collected so far.
  long num_iterations_stats_recorded_ = 0;
};
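// Illustrative sketch (not part of this header): how an application might
// consume the collected stats via get_ddp_logging_data(). The `logger`
// instance and the "module_name" key below are assumptions; "iteration" is
// the key filled in by set_error_and_log() above, and the full set of keys
// is documented in "struct DDPLoggingData".
//
//   at::DDPLoggingData data = logger->get_ddp_logging_data();
//   LOG(INFO) << "DDP module: " << data.strs_map["module_name"];
//   LOG(INFO) << "iterations run: " << data.ints_map["iteration"];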
// A generic logging data struct that holds different types of logging data,
// starting with key-value pairs of strings and integers. It can be extended
// to more types as needed.
struct C10dLoggingData {
  // Logging fields that are string types.
  std::map<std::string, std::string> strings;
  // Logging fields that are int64_t types.
  std::map<std::string, int64_t> integers;
};

class TORCH_API C10dLogger {
 public:
  C10dLogger(const C10dLogger&) = default;
  C10dLogger(C10dLogger&&) = delete;
  C10dLogger& operator=(const C10dLogger&) = default;
  C10dLogger& operator=(C10dLogger&&) = delete;
  virtual ~C10dLogger() = default;
  virtual void log(const C10dLoggingData& data);
  static C10dLogger* getLogger();
  static void registerLogger(std::unique_ptr<C10dLogger>);

 protected:
  // Singleton; hide the constructor from the public.
  C10dLogger(std::string logDestination)
      : logDestination_(std::move(logDestination)) {}

  // The name of the destination this logger should log to.
  std::string logDestination_;

 private:
  static std::unique_ptr<C10dLogger> logger_;
  static std::atomic<bool> registered_;
};

} // namespace c10d
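// Illustrative sketch (an assumption, not code from this header): defining,
// registering, and invoking a custom C10dLogger. The subclass name
// `StderrC10dLogger`, its destination string, and the record keys below are
// hypothetical; only registerLogger(), getLogger(), log(), and the
// C10dLoggingData fields come from the declarations above.
//
//   class StderrC10dLogger : public c10d::C10dLogger {
//    public:
//     // The protected base constructor is reachable from a derived class.
//     StderrC10dLogger() : C10dLogger("stderr") {}
//     void log(const c10d::C10dLoggingData& data) override {
//       for (const auto& kv : data.strings) {
//         LOG(INFO) << logDestination_ << ": " << kv.first << "=" << kv.second;
//       }
//       for (const auto& kv : data.integers) {
//         LOG(INFO) << logDestination_ << ": " << kv.first << "=" << kv.second;
//       }
//     }
//   };
//
//   c10d::C10dLogger::registerLogger(std::make_unique<StderrC10dLogger>());
//   if (auto* logger = c10d::C10dLogger::getLogger()) {
//     c10d::C10dLoggingData data;
//     data.strings["event"] = "broadcast_complete";
//     data.integers["duration_us"] = 1234;
//     logger->log(data);
//   }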