#pragma once

#include <c10/util/Logging.h>
#include <torch/csrc/distributed/c10d/reducer.hpp>

#include <utility>

namespace c10d {

class TORCH_API Logger {
 public:
  explicit Logger(std::shared_ptr<c10d::Reducer> reducer);
  // Set logging data that can be gathered during DistributedDataParallel
  // construction time.
  void set_construction_data_and_log(
      const std::string& module_name,
      const std::vector<int>& device_ids,
      int output_device,
      bool broadcast_buffers,
      bool has_sync_bn,
      bool static_graph);
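  //
  // A minimal usage sketch (argument values are illustrative only):
  //
  //   logger.set_construction_data_and_log(
  //       /*module_name=*/"MyModel",
  //       /*device_ids=*/{0},
  //       /*output_device=*/0,
  //       /*broadcast_buffers=*/true,
  //       /*has_sync_bn=*/false,
  //       /*static_graph=*/false);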

  void set_static_graph();

  // An interface for users to get DDPLoggingData and log it in their
  // applications. The logging fields are explained in "struct DDPLoggingData"
  // in "c10/util/Logging.h".
  at::DDPLoggingData get_ddp_logging_data();
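  //
  // A minimal sketch of consuming the returned data. The strs_map/ints_map
  // fields match their usage in set_error_and_log() below; the particular
  // keys shown are the ones set there:
  //
  //   at::DDPLoggingData data = logger.get_ddp_logging_data();
  //   int64_t iteration = data.ints_map["iteration"];
  //   std::string error = data.strs_map["error"];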

  // Stream insertion operator for writing logging data to a stream under
  // TORCH_DISTRIBUTED_DEBUG.
  friend std::ostream& operator<<(std::ostream& output, const Logger& logger);
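  //
  // Illustrative use (assuming TORCH_DISTRIBUTED_DEBUG is enabled and LOG
  // comes from c10/util/Logging.h, included above):
  //
  //   LOG(INFO) << logger;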

  ~Logger() noexcept(false) {
    // Log whether the DDP graph is static in the Logger dtor instead of the
    // Reducer dtor, since the Logger is deleted before the Reducer.
    log_if_graph_static(reducer_->ddp_graph_static());
  }

  // Set environment variables.
  void set_env_variables();
  // Set parameter stats.
  void set_parameter_stats();
  // Get the size of each bucket (bytes).
  std::vector<int64_t> get_bucket_sizes();
  // Get variable indices for each bucket.
  std::vector<std::vector<size_t>> get_per_bucket_variable_indices();
  // Set the communication hook, if one is used.
  void set_comm_hook(const std::string& hook);
  // Set running with uneven input detection (model.join() context manager).
  void set_uneven_input_join();

  // Reset performance stats at the current iteration.
  void reset_performance_stats();

  // Calculate average stats using the CPU and GPU timers that have been
  // recorded in the reducer.
  void calculate_avg_time(
      int64_t& avg_time,
      int64_t& time_duration,
      Timer& timer,
      Timer::Event start_event,
      Timer::Event end_event);
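  //
  // A sketch of averaging backward compute time. The kBackwardComputeStart/
  // kBackwardComputeEnd event names are assumed from the reducer's Timer:
  //
  //   int64_t avg = 0;
  //   int64_t duration = 0;
  //   logger.calculate_avg_time(
  //       avg,
  //       duration,
  //       timer,
  //       Timer::Event::kBackwardComputeStart,
  //       Timer::Event::kBackwardComputeEnd);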

  // Set the absolute time of the event that has been recorded in the reducer.
  void set_event_time(int64_t& event_time, Timer& timer, Timer::Event event);
  // Set stats that can only be collected during the training loop. It is
  // called at the beginning of the forward call to record the runtime stats
  // of sampled iterations that ran previously. GPU performance stats are
  // currently collected only for single-process single-device programs and
  // single-device modules.
  // TODO: to support single-process multiple-device programs and multi-device
  // modules, events need to be created and recorded on multiple devices.
  void set_runtime_stats_and_log();

  // Called when DDP/reducer is failing with an error. The logging data
  // structure will have two fields filled: "has_error", indicating that this
  // iteration encountered an error (and that the other fields are not valid),
  // and "error", a string containing the error message that DDP failed with.
  template <typename... Args>
  void set_error_and_log(const std::string& ddp_error, const Args&... args) {
    ddp_logging_data_->ints_map["has_error"] = 1;
    auto err = c10::str(ddp_error, args...);
    ddp_logging_data_->strs_map["error"] = err;
    // Report the iteration we are erroring at so the user knows how many
    // examples were successfully processed before this error was hit.
    ddp_logging_data_->ints_map["iteration"] = reducer_->num_iterations_;
    at::LogPyTorchDDPUsage(*ddp_logging_data_);
  }
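  //
  // Since the variadic arguments are concatenated via c10::str, a call can
  // mix strings and numbers. bucket_index and exception_msg below are
  // hypothetical variables, shown only for illustration:
  //
  //   logger.set_error_and_log(
  //       "DDP failed in bucket ", bucket_index, ": ", exception_msg);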

  // When running without static graph, called when the reducer is destroyed
  // to log whether the graph was actually static and is a candidate for the
  // static graph optimization.
  void log_if_graph_static(bool is_static);

 private:
  // ddp_logging_data_ is used to hold all the DDP-related logging data
  // fields.
  std::unique_ptr<at::DDPLoggingData> ddp_logging_data_;
  std::shared_ptr<c10d::Reducer> reducer_;
  // Tracks the number of iterations for which runtime stats have been
  // collected so far.
  long num_iterations_stats_recorded_ = 0;
};

// A generic logging data struct that holds different types of logging data.
// Starting with key-value pairs of strings and integers, it can be extended
// to more types as needed.
struct C10dLoggingData {
  // Logging fields that are string types.
  std::map<std::string, std::string> strings;
  // Logging fields that are int64_t types.
  std::map<std::string, int64_t> integers;
};
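//
// A minimal sketch of filling out an entry (the keys are illustrative, not
// prescribed by the struct):
//
//   C10dLoggingData data;
//   data.strings["event"] = "all_reduce";
//   data.integers["duration_ms"] = 42;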

class TORCH_API C10dLogger {
 public:
  C10dLogger(const C10dLogger&) = default;
  C10dLogger(C10dLogger&&) = delete;
  C10dLogger& operator=(const C10dLogger&) = default;
  C10dLogger& operator=(C10dLogger&&) = delete;
  virtual ~C10dLogger() = default;
  virtual void log(const C10dLoggingData& data);
  static C10dLogger* getLogger();
  static void registerLogger(std::unique_ptr<C10dLogger>);
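  //
  // A sketch of registering and retrieving a process-wide logger. MyLogger
  // is a hypothetical subclass that overrides log() and forwards the
  // protected constructor:
  //
  //   C10dLogger::registerLogger(
  //       std::make_unique<MyLogger>(/*logDestination=*/"my_sink"));
  //   if (C10dLogger* l = C10dLogger::getLogger()) {
  //     l->log(data);
  //   }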

 protected:
  // Singleton; hide the constructor from the public.
  C10dLogger(std::string logDestination)
      : logDestination_(std::move(logDestination)) {}

  // The name of the destination this logger should log to.
  std::string logDestination_;

 private:
  static std::unique_ptr<C10dLogger> logger_;
  static std::atomic<bool> registered_;
};

} // namespace c10d
139