1 // Copyright 2013 The Chromium Authors
2 // Use of this source code is governed by a BSD-style license that can be
3 // found in the LICENSE file.
4
5 #include "net/tools/tld_cleanup/tld_cleanup_util.h"
6
7 #include <sstream>
8 #include <string>
9
10 #include "base/containers/contains.h"
11 #include "base/files/file_util.h"
12 #include "base/logging.h"
13 #include "base/ranges/algorithm.h"
14 #include "base/strings/strcat.h"
15 #include "base/strings/string_number_conversions.h"
16 #include "base/strings/string_util.h"
17 #include "url/gurl.h"
18 #include "url/third_party/mozilla/url_parse.h"
19
namespace {

// Marker comments that open and close the private-domains section of the
// input data.
constexpr char kBeginPrivateDomainsComment[] = "// ===BEGIN PRIVATE DOMAINS===";
constexpr char kEndPrivateDomainsComment[] = "// ===END PRIVATE DOMAINS===";

// Values written into the `type` field of each generated DomainRule entry.
constexpr int kExceptionRule = 1;
constexpr int kWildcardRule = 2;
constexpr int kPrivateRule = 4;

}  // namespace
29
30 namespace net::tld_cleanup {
31
RulesToGperf(const RuleMap & rules)32 std::string RulesToGperf(const RuleMap& rules) {
33 std::string data;
34 data.append("%{\n"
35 "// Copyright 2012 The Chromium Authors\n"
36 "// Use of this source code is governed by a BSD-style license "
37 "that can be\n"
38 "// found in the LICENSE file.\n\n"
39 "// This file is generated by net/tools/tld_cleanup/.\n"
40 "// DO NOT MANUALLY EDIT!\n"
41 "%}\n"
42 "struct DomainRule {\n"
43 " int name_offset;\n"
44 " int type; // flags: 1: exception, 2: wildcard, 4: private\n"
45 "};\n"
46 "%%\n");
47
48 for (const auto& [domain, rule] : rules) {
49 data.append(domain);
50 data.append(", ");
51 int type = 0;
52 if (rule.exception) {
53 type = kExceptionRule;
54 } else if (rule.wildcard) {
55 type = kWildcardRule;
56 }
57 if (rule.is_private) {
58 type += kPrivateRule;
59 }
60 data.append(base::NumberToString(type));
61 data.append("\n");
62 }
63
64 data.append("%%\n");
65
66 return data;
67 }
68
69 // Adjusts the rule to a standard form: removes single extraneous dots and
70 // canonicalizes it using GURL. Returns kSuccess if the rule is interpreted as
71 // valid; logs a warning and returns kWarning if it is probably invalid; and
72 // logs an error and returns kError if the rule is (almost) certainly invalid.
NormalizeRule(std::string & domain,Rule & rule)73 NormalizeResult NormalizeRule(std::string& domain, Rule& rule) {
74 NormalizeResult result = NormalizeResult::kSuccess;
75
76 // Strip single leading and trailing dots.
77 if (domain.starts_with(".")) {
78 domain.erase(0, 1);
79 }
80 if (domain.ends_with(".")) {
81 domain.pop_back();
82 }
83
84 // Allow single leading '*.' or '!', saved here so it's not canonicalized.
85 if (domain.starts_with("!")) {
86 domain.erase(0, 1);
87 rule.exception = true;
88 } else if (domain.starts_with("*.")) {
89 domain.erase(0, 2);
90 rule.wildcard = true;
91 }
92 if (domain.empty()) {
93 LOG(WARNING) << "Ignoring empty rule";
94 return NormalizeResult::kWarning;
95 }
96
97 // Warn about additional '*.' or '!'.
98 if (base::Contains(domain, "*.") || base::Contains(domain, '!')) {
99 LOG(WARNING) << "Keeping probably invalid rule: " << domain;
100 result = NormalizeResult::kWarning;
101 }
102
103 // Make a GURL and normalize it, then get the host back out.
104 GURL gurl(base::StrCat({"http://", domain}));
105 const std::string& spec = gurl.possibly_invalid_spec();
106 url::Component host = gurl.parsed_for_possibly_invalid_spec().host;
107 if (!host.is_valid()) {
108 LOG(ERROR) << "Ignoring rule that couldn't be normalized: " << domain;
109 return NormalizeResult::kError;
110 }
111 if (!gurl.is_valid()) {
112 LOG(WARNING) << "Keeping rule that GURL says is invalid: " << domain;
113 result = NormalizeResult::kWarning;
114 }
115 domain.assign(spec.substr(host.begin, host.len));
116
117 return result;
118 }
119
NormalizeDataToRuleMap(const std::string & data,RuleMap & rules)120 NormalizeResult NormalizeDataToRuleMap(const std::string& data,
121 RuleMap& rules) {
122 // We do a lot of string assignment during parsing, but simplicity is more
123 // important than performance here.
124 NormalizeResult result = NormalizeResult::kSuccess;
125 std::istringstream data_stream(data);
126
127 bool in_private_section = false;
128 RuleMap extra_rules;
129
130 for (std::string line; std::getline(data_stream, line, '\n');) {
131 if (line.starts_with(kBeginPrivateDomainsComment)) {
132 in_private_section = true;
133 continue;
134 }
135 if (line.starts_with(kEndPrivateDomainsComment)) {
136 in_private_section = false;
137 continue;
138 }
139 if (line.starts_with("//")) {
140 // Skip comments.
141 continue;
142 }
143 if (line.empty()) {
144 continue;
145 }
146
147 // Truncate at first whitespace.
148 if (size_t first_whitespace = line.find_first_of("\r\n \t");
149 first_whitespace != std::string::npos) {
150 line.erase(first_whitespace);
151 }
152 std::string domain = line;
153
154 Rule rule{/*exception=*/false, /*wildcard=*/false,
155 /*is_private=*/in_private_section};
156 NormalizeResult new_result = NormalizeRule(domain, rule);
157 result = std::max(result, new_result);
158 if (new_result == NormalizeResult::kError) {
159 continue;
160 }
161
162 // Check the existing rules to make sure we don't have an exception and
163 // wildcard for the same rule, or that the same domain is listed as both
164 // private and not private. If we did, we'd have to update our
165 // parsing code to handle this case.
166 CHECK(!base::Contains(rules, domain))
167 << "Duplicate rule found for " << domain;
168
169 rules[domain] = rule;
170 // Add true TLD for multi-level rules. We don't add them right now, in
171 // case there's an exception or wild card that either exists or might be
172 // added in a later iteration. In those cases, there's no need to add
173 // it and it would just slow down parsing the data.
174 size_t tld_start = domain.find_last_of('.');
175 if (tld_start != std::string::npos && tld_start + 1 < domain.size()) {
176 std::string extra_rule_domain = domain.substr(tld_start + 1);
177 RuleMap::const_iterator iter = extra_rules.find(extra_rule_domain);
178 // If a rule already exists, we ensure that if any of the entries is not
179 // private the result should be that the entry is not private. An example
180 // is .au which is not listed as a real TLD, but only lists second-level
181 // domains such as com.au. Subdomains of .au (eg. blogspot.com.au) are
182 // also listed in the private section, which is processed later, so this
183 // ensures that the real TLD (eg. .au) is listed as public.
184 bool is_private = in_private_section &&
185 (iter == extra_rules.end() || iter->second.is_private);
186 extra_rules[extra_rule_domain] =
187 Rule{/*exception=*/false, /*wildcard=*/false, is_private};
188 }
189 }
190
191 base::ranges::copy_if(extra_rules, std::inserter(rules, rules.end()),
192 [&](const auto& extra_rule) {
193 return !base::Contains(rules, extra_rule.first);
194 });
195
196 return result;
197 }
198
NormalizeFile(const base::FilePath & in_filename,const base::FilePath & out_filename)199 NormalizeResult NormalizeFile(const base::FilePath& in_filename,
200 const base::FilePath& out_filename) {
201 RuleMap rules;
202 std::string data;
203 if (!base::ReadFileToString(in_filename, &data)) {
204 LOG(ERROR) << "Unable to read file";
205 // We return success since we've already reported the error.
206 return NormalizeResult::kSuccess;
207 }
208
209 NormalizeResult result = NormalizeDataToRuleMap(data, rules);
210
211 if (!base::WriteFile(out_filename, RulesToGperf(rules))) {
212 LOG(ERROR) << "Error(s) writing output file";
213 result = NormalizeResult::kError;
214 }
215
216 return result;
217 }
218
219 } // namespace net::tld_cleanup
220