xref: /aosp_15_r20/external/cronet/net/tools/tld_cleanup/tld_cleanup_util.cc (revision 6777b5387eb2ff775bb5750e3f5d96f37fb7352b)
1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4 
5 #include "net/tools/tld_cleanup/tld_cleanup_util.h"
6 
7 #include <sstream>
8 #include <string>
9 
10 #include "base/containers/contains.h"
11 #include "base/files/file_util.h"
12 #include "base/logging.h"
13 #include "base/ranges/algorithm.h"
14 #include "base/strings/strcat.h"
15 #include "base/strings/string_number_conversions.h"
16 #include "base/strings/string_util.h"
17 #include "url/gurl.h"
18 #include "url/third_party/mozilla/url_parse.h"
19 
namespace {

// Marker lines that open and close the private-domains section of the input
// rule list.
constexpr char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
constexpr char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";

// Flag values emitted in the "type" column of the generated gperf entries.
constexpr int kExceptionRule = 1;
constexpr int kWildcardRule = 2;
constexpr int kPrivateRule = 4;

}  // namespace
29 
30 namespace net::tld_cleanup {
31 
RulesToGperf(const RuleMap & rules)32 std::string RulesToGperf(const RuleMap& rules) {
33   std::string data;
34   data.append("%{\n"
35               "// Copyright 2012 The Chromium Authors\n"
36               "// Use of this source code is governed by a BSD-style license "
37               "that can be\n"
38               "// found in the LICENSE file.\n\n"
39               "// This file is generated by net/tools/tld_cleanup/.\n"
40               "// DO NOT MANUALLY EDIT!\n"
41               "%}\n"
42               "struct DomainRule {\n"
43               "  int name_offset;\n"
44               "  int type;  // flags: 1: exception, 2: wildcard, 4: private\n"
45               "};\n"
46               "%%\n");
47 
48   for (const auto& [domain, rule] : rules) {
49     data.append(domain);
50     data.append(", ");
51     int type = 0;
52     if (rule.exception) {
53       type = kExceptionRule;
54     } else if (rule.wildcard) {
55       type = kWildcardRule;
56     }
57     if (rule.is_private) {
58       type += kPrivateRule;
59     }
60     data.append(base::NumberToString(type));
61     data.append("\n");
62   }
63 
64   data.append("%%\n");
65 
66   return data;
67 }
68 
69 // Adjusts the rule to a standard form: removes single extraneous dots and
70 // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as
71 // valid; logs a warning and returns kWarning if it is probably invalid; and
72 // logs an error and returns kError if the rule is (almost) certainly invalid.
NormalizeRule(std::string & domain,Rule & rule)73 NormalizeResult NormalizeRule(std::string& domain, Rule& rule) {
74   NormalizeResult result = NormalizeResult::kSuccess;
75 
76   // Strip single leading and trailing dots.
77   if (domain.starts_with(".")) {
78     domain.erase(0, 1);
79   }
80   if (domain.ends_with(".")) {
81     domain.pop_back();
82   }
83 
84   // Allow single leading '*.' or '!', saved here so it's not canonicalized.
85   if (domain.starts_with("!")) {
86     domain.erase(0, 1);
87     rule.exception = true;
88   } else if (domain.starts_with("*.")) {
89     domain.erase(0, 2);
90     rule.wildcard = true;
91   }
92   if (domain.empty()) {
93     LOG(WARNING) << "Ignoring empty rule";
94     return NormalizeResult::kWarning;
95   }
96 
97   // Warn about additional '*.' or '!'.
98   if (base::Contains(domain, "*.") || base::Contains(domain, '!')) {
99     LOG(WARNING) << "Keeping probably invalid rule: " << domain;
100     result = NormalizeResult::kWarning;
101   }
102 
103   // Make a GURL and normalize it, then get the host back out.
104   GURL gurl(base::StrCat({"http://", domain}));
105   const std::string& spec = gurl.possibly_invalid_spec();
106   url::Component host = gurl.parsed_for_possibly_invalid_spec().host;
107   if (!host.is_valid()) {
108     LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << domain;
109     return NormalizeResult::kError;
110   }
111   if (!gurl.is_valid()) {
112     LOG(WARNING) << "Keeping rule that GURL says is invalid: " << domain;
113     result = NormalizeResult::kWarning;
114   }
115   domain.assign(spec.substr(host.begin, host.len));
116 
117   return result;
118 }
119 
NormalizeDataToRuleMap(const std::string & data,RuleMap & rules)120 NormalizeResult NormalizeDataToRuleMap(const std::string& data,
121                                        RuleMap& rules) {
122   // We do a lot of string assignment during parsing, but simplicity is more
123   // important than performance here.
124   NormalizeResult result = NormalizeResult::kSuccess;
125   std::istringstream data_stream(data);
126 
127   bool in_private_section = false;
128   RuleMap extra_rules;
129 
130   for (std::string line; std::getline(data_stream, line, '\n');) {
131     if (line.starts_with(kBeginPrivateDomainsComment)) {
132       in_private_section = true;
133       continue;
134     }
135     if (line.starts_with(kEndPrivateDomainsComment)) {
136       in_private_section = false;
137       continue;
138     }
139     if (line.starts_with("//")) {
140       // Skip comments.
141       continue;
142     }
143     if (line.empty()) {
144       continue;
145     }
146 
147     // Truncate at first whitespace.
148     if (size_t first_whitespace = line.find_first_of("\r\n \t");
149         first_whitespace != std::string::npos) {
150       line.erase(first_whitespace);
151     }
152     std::string domain = line;
153 
154     Rule rule{/*exception=*/false, /*wildcard=*/false,
155               /*is_private=*/in_private_section};
156     NormalizeResult new_result = NormalizeRule(domain, rule);
157     result = std::max(result, new_result);
158     if (new_result == NormalizeResult::kError) {
159       continue;
160     }
161 
162     // Check the existing rules to make sure we don't have an exception and
163     // wildcard for the same rule, or that the same domain is listed as both
164     // private and not private. If we did, we'd have to update our
165     // parsing code to handle this case.
166     CHECK(!base::Contains(rules, domain))
167         << "Duplicate rule found for " << domain;
168 
169     rules[domain] = rule;
170     // Add true TLD for multi-level rules.  We don't add them right now, in
171     // case there's an exception or wild card that either exists or might be
172     // added in a later iteration.  In those cases, there's no need to add
173     // it and it would just slow down parsing the data.
174     size_t tld_start = domain.find_last_of('.');
175     if (tld_start != std::string::npos && tld_start + 1 < domain.size()) {
176       std::string extra_rule_domain = domain.substr(tld_start + 1);
177       RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain);
178       // If a rule already exists, we ensure that if any of the entries is not
179       // private the result should be that the entry is not private.  An example
180       // is .au which is not listed as a real TLD, but only lists second-level
181       // domains such as com.au. Subdomains of .au (eg. blogspot.com.au) are
182       // also listed in the private section, which is processed later, so this
183       // ensures that the real TLD (eg. .au) is listed as public.
184       bool is_private = in_private_section &&
185                         (iter == extra_rules.end() || iter->second.is_private);
186       extra_rules[extra_rule_domain] =
187           Rule{/*exception=*/false, /*wildcard=*/false, is_private};
188     }
189   }
190 
191   base::ranges::copy_if(extra_rules, std::inserter(rules, rules.end()),
192                         [&](const auto& extra_rule) {
193                           return !base::Contains(rules, extra_rule.first);
194                         });
195 
196   return result;
197 }
198 
NormalizeFile(const base::FilePath & in_filename,const base::FilePath & out_filename)199 NormalizeResult NormalizeFile(const base::FilePath& in_filename,
200                               const base::FilePath& out_filename) {
201   RuleMap rules;
202   std::string data;
203   if (!base::ReadFileToString(in_filename, &data)) {
204     LOG(ERROR) << "Unable to read file";
205     // We return success since we've already reported the error.
206     return NormalizeResult::kSuccess;
207   }
208 
209   NormalizeResult result = NormalizeDataToRuleMap(data, rules);
210 
211   if (!base::WriteFile(out_filename, RulesToGperf(rules))) {
212     LOG(ERROR) << "Error(s) writing output file";
213     result = NormalizeResult::kError;
214   }
215 
216   return result;
217 }
218 
219 }  // namespace net::tld_cleanup
220