xref: /aosp_15_r20/external/cldr/tools/scripts/tr-archive/extract-link-targets.js (revision 912701f9769bb47905792267661f0baf2b85bed5)
1*912701f9SAndroid Build Coastguard Workerconst fs = require("fs").promises;
2*912701f9SAndroid Build Coastguard Workerconst jsdom = require("jsdom");
3*912701f9SAndroid Build Coastguard Workerconst { JSDOM } = jsdom;
4*912701f9SAndroid Build Coastguard Workerconst path = require("path");
5*912701f9SAndroid Build Coastguard Worker
6*912701f9SAndroid Build Coastguard Worker/**
7*912701f9SAndroid Build Coastguard Worker * Run this after outputting html into 'dist'
8*912701f9SAndroid Build Coastguard Worker * It will update ../../../docs/ldml/*.anchors.json
9*912701f9SAndroid Build Coastguard Worker * Use source control to see if the links have changed.
10*912701f9SAndroid Build Coastguard Worker */
11*912701f9SAndroid Build Coastguard Worker
12*912701f9SAndroid Build Coastguard Worker// We would ideally run marked and process the output here.
13*912701f9SAndroid Build Coastguard Worker// But that might introduce duplicate code.
// Status icons prepended to console messages.
// Glyphs outside the Basic Multilingual Plane are written as code point
// escapes so they survive tools that mishandle astral characters.
const DONE_ICON = "✅";
const GEAR_ICON = "⚙️";
const NONE_ICON = "∅";
// NOTE(review): the four escaped glyphs below were unreadable (U+FFFD) in the
// copy this was restored from; values are chosen to match the constant names.
// Confirm against the upstream file.
const PACKAGE_ICON = "\u{1F4E6}"; // package
const SECTION_ICON = "\u{1F4CD}"; // round pushpin
const TYPE_ICON = "\u{1F4C2}"; // open file folder
const WARN_ICON = "⚠️";
const POINT_ICON = "\u{1F449}"; // backhand index pointing right
const MISSING_ICON = "❌";
23*912701f9SAndroid Build Coastguard Worker
/**
 * Build the markdown-relative link for a section, optionally with a fragment.
 *
 * @param {string} targetSection section basename, e.g. 'tr35-info'
 * @param {string} [anchor] fragment within the section, e.g. 'Parts';
 *   when falsy (missing or empty) only the page name is returned
 * @returns {string} e.g. 'tr35-info.md#Parts', or 'tr35-info.md' without anchor
 */
function constructLink(targetSection, anchor) {
  const suffix = anchor ? `#${anchor}` : "";
  return `${targetSection}.md${suffix}`;
}
37*912701f9SAndroid Build Coastguard Worker
/**
 * Read one generated .html file, collect the anchors it defines and the
 * internal links it points at, and write the sorted anchor list to
 * `../../../docs/ldml/<basename>.anchors.json` (relative to the working
 * directory).
 *
 * Duplicate anchors and unrecognized link forms are reported on stderr but do
 * not abort processing.
 *
 * @param {string} infile path to the input .html file
 * @returns {Promise<[string, string[], string[]]>} tuple of
 *   [section basename, sorted anchors, sorted internal link targets];
 *   targets are in constructLink() form, e.g. 'tr35-info.md#Parts'
 */
async function extractAnchors(infile) {
  // Link classifiers, tried in this order by addTarget().
  const INTRA_PAGE_LINK = /^#(.*)$/; // starts with #  => 1=anchor
  const TR_SECTION_LINK = /^(tr35[^.]*)\.html(?:#(.*))?$/; // => 1=basename, 2=anchor
  const EXTERNAL_LINK = /^(http|https|mailto|ftp):.*$/; // scheme

  const basename = path.basename(infile, ".html");
  const dirname = '../../../docs/ldml';
  console.log(`${SECTION_ICON} Reading ${infile}`);
  let rawHtml = await fs.readFile(infile, "utf-8");

  // Once decoded, a leading BOM is the single code unit U+FEFF, so exactly one
  // character is removed (its three-byte UTF-8 form is already gone).
  if (rawHtml.charCodeAt(0) === 0xfeff) {
    rawHtml = rawHtml.substring(1);
  }

  // Spin up a JSDOM so we can query the parsed document.
  const dom = new JSDOM(rawHtml);
  const document = dom.window.document;

  const anchors = new Set();
  const targets = new Set();

  /** Record an anchor name, warning if it was already defined on this page. */
  function addAnchor(n) {
    if (!n) return;
    if (anchors.has(n)) {
      console.error(`${WARN_ICON} ${constructLink(basename)}: Duplicate anchor: #${n}`);
    } else {
      anchors.add(n);
    }
  }

  /** Classify an href and record it as a target when it is an internal link. */
  function addTarget(href) {
    const intraPage = INTRA_PAGE_LINK.exec(href);
    if (intraPage) {
      // same page
      targets.add(constructLink(basename, intraPage[1]));
      return;
    }
    const trSection = TR_SECTION_LINK.exec(href);
    if (trSection) {
      // another page
      targets.add(constructLink(trSection[1], trSection[2]));
      return;
    }
    if (EXTERNAL_LINK.test(href)) {
      // external: not checked here
      // TODO: add to list of external links?
      return;
    }
    // Error on all other links
    console.error(`${WARN_ICON} ${basename}: Unknown anchor: ${href}`);
  }

  // extract anchors: every element's id, plus the legacy name attribute on <a>
  for (const element of document.getElementsByTagName("*")) {
    addAnchor(element.getAttribute("id"));

    if (element.tagName === 'A') {
      addAnchor(element.getAttribute("name"));
    }
  }
  // extract targets
  for (const link of document.getElementsByTagName("A")) {
    const href = link.getAttribute("href");
    if (href) {
      addTarget(href);
    }
  }

  // Root-locale collation keeps the output order stable, so diffs of the
  // generated .anchors.json stay meaningful.
  const coll = new Intl.Collator(['und']);
  const anchorList = Array.from(anchors.values()).sort(coll.compare);
  const anchorFile = path.join(dirname, `${basename}.anchors.json`);
  await fs.writeFile(anchorFile, JSON.stringify(anchorList, null, '  '));
  const targetList = Array.from(targets.values()).sort(coll.compare);
  return [basename, anchorList, targetList];
}
121*912701f9SAndroid Build Coastguard Worker
/**
 * Run extractAnchors() over every .html file found directly in ./dist
 * (relative to the working directory). Files are processed concurrently.
 *
 * @returns {Promise<Array>} one extractAnchors() result per .html file
 */
async function extractAll() {
  const outbox = "./dist";

  const entries = await fs.readdir(outbox);
  const fileList = entries
    .filter((f) => f.endsWith(".html"))
    .map((f) => path.join(outbox, f));
  // Wrap the call so map's extra (index, array) arguments are not forwarded.
  return Promise.all(fileList.map((f) => extractAnchors(f)));
}
134*912701f9SAndroid Build Coastguard Worker
/**
 * Extract anchors and link targets from all pages, then verify that every
 * internal link target resolves to a known page or anchor.
 *
 * Each broken link is reported on stderr together with the section(s) that
 * reference it, and process.exitCode is set to 1 if any are found.
 *
 * @returns {Promise<string[]>} basenames of the sections that were processed
 */
async function checkAll() {
  console.log(`${GEAR_ICON} Reading HTML`);
  const checked = await extractAll();
  console.log(`${GEAR_ICON} Collecting internal links`);

  const allInternalTargets = new Set();
  const allInternalAnchors = new Set();
  const sectionToTargets = {
    // e.g.  "tr35-info" : Set(["tr35-keyboards.md#Element_keyboard", …])
  };
  for (const [sourceSection, anchorList, targetList] of checked) {
    // The bare page is itself a valid target, example: 'tr35-collation.md'
    allInternalAnchors.add(constructLink(sourceSection));
    for (const target of targetList) {
      allInternalTargets.add(target);
    }
    sectionToTargets[sourceSection] = new Set(targetList); // for error checking
    for (const anchor of anchorList) {
      allInternalAnchors.add(constructLink(sourceSection, anchor)); // tr35-collation.md#Parts
    }
  }

  console.log(`${GEAR_ICON} Checking ${allInternalTargets.size} internal links against ${allInternalAnchors.size} anchors`);

  const missingInternalLinks = new Set();

  for (const expectedAnchor of allInternalTargets.values()) {
    if (!allInternalAnchors.has(expectedAnchor)) {
      missingInternalLinks.add(expectedAnchor);
    }
  }

  if (missingInternalLinks.size > 0) {
    for (const expectedAnchor of missingInternalLinks.values()) {
      // coalesce: list every section that links to this missing anchor
      const sourceSections = Object.entries(sectionToTargets)
        .filter(([, sectionTargets]) => sectionTargets.has(expectedAnchor)) // Does this section target this anchor?
        .map(([section]) => constructLink(section)) // drop the set
        .join(' & ') // join section name(s)
        || '(unknown section(s))'; // error
      console.error(`${MISSING_ICON} Broken internal link: ${sourceSections}: (${expectedAnchor})`);
    }
    console.error(`${WARN_ICON} ${missingInternalLinks.size} missing links.`);
    process.exitCode = 1;
  }

  console.log(`${POINT_ICON} use: 'lychee --cache docs/ldml' to check external links`);

  return checked.map(([anchorFile]) => anchorFile);
}
// Script entry point: run the check, list each processed section on success,
// and mark the process as failed if the check itself rejects.
checkAll().then(
  (sections) => {
    for (const section of sections) {
      console.log(`${DONE_ICON} ${constructLink(section)}`);
    }
  },
  (err) => {
    console.error(err);
    process.exitCode = 1;
  }
);
190