// Copyright 2023 The Chromium Authors. All rights reserved.
// Use of this source code is governed by a BSD-style license that can be
// found in the LICENSE file.
//
// This executable downloads a number of Wikipedia pages for the given
// locale(s), breaks them into smaller parts by sections, then by sentences,
// and writes them into separate files.

package main

import (
	"flag"
	"fmt"
	"os"
	"path/filepath"
	"strconv"
	"strings"

	gowiki "github.com/trietmn/go-wiki"

	"go.skia.org/skia/tools/unicode_comparison/go/bridge"
	"go.skia.org/skia/tools/unicode_comparison/go/helpers"
)

// downloadLocalPagesBySections fetches each page from searchResult, walks its
// sections, splits every section into sentences and writes each sentence into
// its own file under localInput. It returns the number of files written and
// stops as soon as lastCount plus that number reaches fileLimit.
func downloadLocalPagesBySections(searchResult []string, localInput string, lastCount, fileLimit, textLimit int) int {
	countSentences := 0
	for _ /*index*/, element := range searchResult {
		// Get the page
		page, err := gowiki.GetPage(element, -1, false, true)
		if err != nil {
			fmt.Println(err)
			continue
		}

		sections, err := page.GetSectionList()
		if err != nil {
			fmt.Println(err)
			continue
		}
		for si, section := range sections {
			if si == len(sections)-1 {
				// It looks like this library breaks on the last section
				break
			}
			if len(section) == 0 {
				fmt.Println("Empty section!")
				continue
			}

			content, err := page.GetSection(section)
			if err != nil {
				fmt.Println(err)
				continue
			}

			trimmed := strings.TrimSpace(content)
			if len(trimmed) == 0 {
				continue
			}

			// Generate sentence-by-sentence texts from the same section
			sentences := bridge.GetSentences(trimmed)
			start := 0
			for _ /*i*/, end := range sentences {
				smallFileName := localInput + "/page." + strconv.Itoa(lastCount+countSentences+1) // + "." + strconv.Itoa(index+1) + "." + strconv.Itoa(si+1) + "." + strconv.Itoa(i+1)
				smallText := strings.TrimSpace(trimmed[start:end])
				if len(smallText) == 0 {
					continue
				} else if len(smallText) > textLimit {
					// Cut the text down to textLimit without splitting a sentence
					trim := 0
					if bridge.TrimSentence(smallText, &trim, textLimit) {
						smallText = smallText[:trim]
					}
				}
				helpers.WriteTextFile(smallFileName, smallText)
				start = int(end)
				countSentences += 1
				if lastCount+countSentences >= fileLimit {
					return countSentences
				}
			}
		}
	}
	return countSentences
}

func main() {
	var (
		root      = flag.String("root", "~/datasets", "Folder (pages will be downloaded under <Folder>/input)")
		locale    = flag.String("locale", "*", "Locale")
		pattern   = flag.String("pattern", "*", "Pattern for search")
		fileLimit = flag.Int("fileLimit", 10, "Number of text files to download")
		pageLimit = flag.Int("pageLimit", 5, "Number of pages to download in one attempt")
		textLimit = flag.Int("textLimit", 1000, "Max length of a single text")
		verbose   = flag.Bool("verbose", true, "Print more details about the process")
	)
	flag.Parse()
	if *root == "" {
		fmt.Println("Must set --root")
		flag.PrintDefaults()
		return
	}

	if !bridge.InitUnicode("icu") {
		return
	}

	*root = helpers.ExpandPath(*root)
	input := filepath.Join(*root, "input")

	if *verbose {
		fmt.Printf("Downloading wiki pages:\n")
		fmt.Printf("root=%v\n", *root)
		fmt.Printf("locale=%v\n", *locale)
		fmt.Printf("pattern=%v\n", *pattern)
		fmt.Printf("fileLimit=%v\n", *fileLimit)
		fmt.Printf("pageLimit=%v\n", *pageLimit)
		fmt.Printf("textLimit=%v\n", *textLimit)
	}

	locales := []string{}
	if *locale != "*" {
		locales = strings.Split(*locale, ",")
	} else {
		// Sorted by number of wiki pages, descending
		locales = []string{"en", "ru", "it", "de", "ro", "uk", "fa", "he", "fi", "fr", "zh", "ar", "id", "tr", "th", "vi", "lv", "lt", "hr", "az", "el", "ms", "bn", "te", "ur"}
		// "ka", "pt" do not get downloaded properly
	}

	for _, loc := range locales {
		localInput := filepath.Join(input, loc)

		err := os.MkdirAll(localInput, os.ModePerm)
		helpers.Check(err)

		gowiki.SetLanguage(loc)
		fileCount := 0
		// Allow up to fileLimit*10 unproductive attempts before giving up on the locale
		attempt := *fileLimit * 10
		for fileCount < *fileLimit && attempt > 0 {
			files := 0
			if *pattern == "*" {
				searchResult, err := gowiki.GetRandom(*pageLimit)
				if err != nil {
					attempt -= 1
					fmt.Printf("Cannot download %d random pages for locale %s:\n%s\n", *pageLimit, loc, err)
					continue
				}
				files = downloadLocalPagesBySections(searchResult, localInput, fileCount, *fileLimit, *textLimit)
			} else {
				searchResult, _, err := gowiki.Search(*pattern, *pageLimit, true)
				helpers.Check(err)
				files = downloadLocalPagesBySections(searchResult, localInput, fileCount, *fileLimit, *textLimit)
			}
			if files == 0 {
				attempt -= 1
			} else {
				fileCount += files
			}
		}
		if *verbose {
			if fileCount >= *fileLimit {
				fmt.Printf("Locale %s (%v files)\n", loc, fileCount)
			} else if fileCount == 0 {
				fmt.Printf("Locale %s does not contain text after %v attempts to download\n", loc, *fileLimit)
			} else {
				fmt.Printf("Locale %s contains fewer texts than %v after %v attempts to download\n", loc, *fileLimit, *fileLimit)
			}
		}
	}

	bridge.CleanupUnicode()
}