xref: /aosp_15_r20/external/skia/tools/unicode_comparison/go/download_wiki/main.go (revision c8dee2aa9b3f27cf6c858bd81872bdeb2c07ed17)
1// Copyright 2023 The Chromium Authors. All rights reserved.
2// Use of this source code is governed by a BSD-style license that can be
3// found in the LICENSE file.
4//
// This executable downloads a number of Wikipedia pages for the given locale(s),
// splits them into sections and then into sentences, and writes each sentence
// to a separate file.
8
9package main
10
11import (
12	"flag"
13	"fmt"
14	gowiki "github.com/trietmn/go-wiki"
15	"go.skia.org/skia/tools/unicode_comparison/go/bridge"
16	"go.skia.org/skia/tools/unicode_comparison/go/helpers"
17	"os"
18	"path/filepath"
19	"strconv"
20	"strings"
21)
22
23func downloadLocalPagesBySections(searchResult []string, localInput string, lastCount, fileLimit, textLimit int) int {
24	countSentences := 0
25	for _ /*index*/, element := range searchResult {
26		// Get the page
27		page, err := gowiki.GetPage(element, -1, false, true)
28		if err != nil {
29			fmt.Println(err)
30			continue
31		}
32
33		sections, err := page.GetSectionList()
34		if err != nil {
35			fmt.Println(err)
36			continue
37		}
38		for si, section := range sections {
39			if si == len(sections)-1 {
40				// It looks like this library breaks on the last section
41				break
42			}
43			if len(section) == 0 {
44				fmt.Println("Empty section!")
45				continue
46			}
47
48			content, err := page.GetSection(section)
49			if err != nil {
50				fmt.Println(err)
51				continue
52			}
53
54			trimmed := strings.TrimSpace(content)
55			if len(trimmed) == 0 {
56				continue
57			}
58
59			// We generate broked by sentences texts from the same section
60			sentences := bridge.GetSentences(trimmed)
61			start := 0
62			for _ /*i*/, end := range sentences {
63				smallFileName := localInput + "/page." + strconv.Itoa(lastCount+countSentences+1) // + "." + strconv.Itoa(index+1) + "." + strconv.Itoa(si+1) + "." + strconv.Itoa(i+1)
64				smallText := strings.TrimSpace(trimmed[start:end])
65				if len(smallText) == 0 {
66					continue
67				} else if len(smallText) > textLimit {
68					trim := 0
69					if bridge.TrimSentence(smallText, &trim, textLimit) {
70						smallText = smallText[:trim]
71					}
72				}
73				helpers.WriteTextFile(smallFileName, smallText)
74				start = int(end)
75				countSentences += 1
76				if lastCount+countSentences >= fileLimit {
77					return countSentences
78				}
79			}
80		}
81	}
82	return countSentences
83}
84
85func main() {
86	var (
87		root      = flag.String("root", "~/datasets", "Folder (pages will be downloaded under <Folder>/input")
88		locale    = flag.String("locale", "*", "Locale")
89		pattern   = flag.String("pattern", "*", "Pattern for search")
90		fileLimit = flag.Int("fileLimit", 10, "Number of text files to download")
91		pageLimit = flag.Int("pageLimit", 5, "Number of pages to download in one attempt")
92		textLimit = flag.Int("textLimit", 1000, "Max length of a single text")
93		verbose   = flag.Bool("verbose", true, "Print more details about the process")
94	)
95	flag.Parse()
96	if *root == "" {
97		fmt.Println("Must set --root")
98		flag.PrintDefaults()
99	}
100
101	if !bridge.InitUnicode("icu") {
102		return
103	}
104
105	*root = helpers.ExpandPath(*root)
106	input := filepath.Join(*root, "input")
107
108	if *verbose {
109		fmt.Printf("Downloading wiki pages:")
110		fmt.Printf("root=%v\n", *root)
111		fmt.Printf("locale=%v\n", *locale)
112		fmt.Printf("pattern=%v\n", *pattern)
113		fmt.Printf("fileLimit=%v\n", *fileLimit)
114		fmt.Printf("pageLimit=%v\n", *pageLimit)
115		fmt.Printf("textLimit=%v\n", *textLimit)
116	}
117
118	locales := []string{}
119	if *locale != "*" {
120		locales = strings.Split(*locale, ",")
121	} else {
122		// Sorted down by number of wiki pages
123		locales = []string{"en", "ru", "it", "de", "ro", "uk", "fa", "he", "fi", "fr", "zh", "ar", "id", "tr", "th", "vi", "lv", "lt", "hr", "az", "el", "ms", "bn", "te", "ur"}
124		// "ka", "pt" do not get downloaded properly
125	}
126
127	for _, loc := range locales {
128		localInput := filepath.Join(input, loc)
129
130		err := os.MkdirAll(localInput, os.ModePerm)
131		helpers.Check(err)
132
133		gowiki.SetLanguage(loc)
134		fileCount := 0
135		attempt := *fileLimit * 10
136		for fileCount < *fileLimit && attempt > 0 {
137			files := 0
138			if *pattern == "*" {
139				searchResult, err := gowiki.GetRandom(*pageLimit)
140				if err != nil {
141					attempt -= 1
142					fmt.Printf("Cannot download %d random pages for locale %s:\n%s\n", *pageLimit, loc, err)
143					continue
144				}
145				files = downloadLocalPagesBySections(searchResult, localInput, fileCount, *fileLimit, *textLimit)
146			} else {
147				searchResult, _, err := gowiki.Search(*pattern, *pageLimit, true)
148				helpers.Check(err)
149				files = downloadLocalPagesBySections(searchResult, localInput, fileCount, *fileLimit, *textLimit)
150			}
151			if files == 0 {
152				attempt -= 1
153			} else {
154				fileCount += files
155			}
156		}
157		if *verbose {
158			if fileCount >= *fileLimit {
159				fmt.Printf("Locale %s (%v files)\n", loc, fileCount)
160			} else if fileCount == 0 {
161				fmt.Printf("Locale %s does not containt text on %v attempts to download\n", loc, *fileLimit)
162			} else {
163				fmt.Printf("Locale %s containt less texts than %v on %v attempts to download\n", loc, *fileLimit, *fileLimit)
164			}
165		}
166	}
167
168	bridge.CleanupUnicode()
169}
170