// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package backend contains the necessary functions to classify a license.
package backend

import (
	"context"
	"fmt"
	"io/ioutil"
	"log"
	"sync"
	"time"

	"github.com/google/licenseclassifier"
	"github.com/google/licenseclassifier/commentparser"
	"github.com/google/licenseclassifier/commentparser/language"
	"github.com/google/licenseclassifier/tools/identify_license/results"
)

// ClassifierInterface is the interface each backend must implement.
33*46c4c49dSIbrahim Kanouchetype ClassifierInterface interface { 34*46c4c49dSIbrahim Kanouche Close() 35*46c4c49dSIbrahim Kanouche ClassifyLicenses(filenames []string, headers bool) []error 36*46c4c49dSIbrahim Kanouche ClassifyLicensesWithContext(ctx context.Context, filenames []string, headers bool) []error 37*46c4c49dSIbrahim Kanouche GetResults() results.LicenseTypes 38*46c4c49dSIbrahim Kanouche} 39*46c4c49dSIbrahim Kanouche 40*46c4c49dSIbrahim Kanouche// ClassifierBackend is an object that handles classifying a license. 41*46c4c49dSIbrahim Kanouchetype ClassifierBackend struct { 42*46c4c49dSIbrahim Kanouche results results.LicenseTypes 43*46c4c49dSIbrahim Kanouche mu sync.Mutex 44*46c4c49dSIbrahim Kanouche classifier *licenseclassifier.License 45*46c4c49dSIbrahim Kanouche} 46*46c4c49dSIbrahim Kanouche 47*46c4c49dSIbrahim Kanouche// New creates a new backend working on the local filesystem. 48*46c4c49dSIbrahim Kanouchefunc New(threshold float64, forbiddenOnly bool) (*ClassifierBackend, error) { 49*46c4c49dSIbrahim Kanouche var lc *licenseclassifier.License 50*46c4c49dSIbrahim Kanouche var err error 51*46c4c49dSIbrahim Kanouche if forbiddenOnly { 52*46c4c49dSIbrahim Kanouche lc, err = licenseclassifier.NewWithForbiddenLicenses(threshold) 53*46c4c49dSIbrahim Kanouche } else { 54*46c4c49dSIbrahim Kanouche lc, err = licenseclassifier.New(threshold) 55*46c4c49dSIbrahim Kanouche } 56*46c4c49dSIbrahim Kanouche if err != nil { 57*46c4c49dSIbrahim Kanouche return nil, err 58*46c4c49dSIbrahim Kanouche } 59*46c4c49dSIbrahim Kanouche return &ClassifierBackend{classifier: lc}, nil 60*46c4c49dSIbrahim Kanouche} 61*46c4c49dSIbrahim Kanouche 62*46c4c49dSIbrahim Kanouche// Close does nothing here since there's nothing to close. 63*46c4c49dSIbrahim Kanouchefunc (b *ClassifierBackend) Close() { 64*46c4c49dSIbrahim Kanouche} 65*46c4c49dSIbrahim Kanouche 66*46c4c49dSIbrahim Kanouche// ClassifyLicenses runs the license classifier over the given file. 
67*46c4c49dSIbrahim Kanouchefunc (b *ClassifierBackend) ClassifyLicenses(filenames []string, headers bool) (errors []error) { 68*46c4c49dSIbrahim Kanouche return b.ClassifyLicensesWithContext(context.Background(), filenames, headers) 69*46c4c49dSIbrahim Kanouche} 70*46c4c49dSIbrahim Kanouche 71*46c4c49dSIbrahim Kanouche// ClassifyLicensesWithContext runs the license classifier over the given file; 72*46c4c49dSIbrahim Kanouche// ensure that it will respect the timeout and cancelation in the provided context. 73*46c4c49dSIbrahim Kanouchefunc (b *ClassifierBackend) ClassifyLicensesWithContext(ctx context.Context, filenames []string, headers bool) (errors []error) { 74*46c4c49dSIbrahim Kanouche 75*46c4c49dSIbrahim Kanouche files := make(chan string, len(filenames)) 76*46c4c49dSIbrahim Kanouche for _, f := range filenames { 77*46c4c49dSIbrahim Kanouche files <- f 78*46c4c49dSIbrahim Kanouche } 79*46c4c49dSIbrahim Kanouche close(files) 80*46c4c49dSIbrahim Kanouche errs := make(chan error, len(filenames)) 81*46c4c49dSIbrahim Kanouche 82*46c4c49dSIbrahim Kanouche var wg sync.WaitGroup 83*46c4c49dSIbrahim Kanouche 84*46c4c49dSIbrahim Kanouche // Create a pool from which tasks can later be started. We use a pool because the OS limits 85*46c4c49dSIbrahim Kanouche // the number of files that can be open at any one time. 86*46c4c49dSIbrahim Kanouche const numTasks = 1000 87*46c4c49dSIbrahim Kanouche wg.Add(numTasks) 88*46c4c49dSIbrahim Kanouche 89*46c4c49dSIbrahim Kanouche for i := 0; i < numTasks; i++ { 90*46c4c49dSIbrahim Kanouche go func() { 91*46c4c49dSIbrahim Kanouche // Ensure that however this function terminates, the wait group 92*46c4c49dSIbrahim Kanouche // is unblocked 93*46c4c49dSIbrahim Kanouche defer wg.Done() 94*46c4c49dSIbrahim Kanouche 95*46c4c49dSIbrahim Kanouche for { 96*46c4c49dSIbrahim Kanouche filename := <-files 97*46c4c49dSIbrahim Kanouche 98*46c4c49dSIbrahim Kanouche // no file? 
we're done 99*46c4c49dSIbrahim Kanouche if filename == "" { 100*46c4c49dSIbrahim Kanouche break 101*46c4c49dSIbrahim Kanouche } 102*46c4c49dSIbrahim Kanouche 103*46c4c49dSIbrahim Kanouche // If the context is done, record that the file was not 104*46c4c49dSIbrahim Kanouche // classified due to the context's termination. 105*46c4c49dSIbrahim Kanouche if err := ctx.Err(); err != nil { 106*46c4c49dSIbrahim Kanouche errs <- fmt.Errorf("file %s not classified due to context completion: %v", filename, err) 107*46c4c49dSIbrahim Kanouche continue 108*46c4c49dSIbrahim Kanouche } 109*46c4c49dSIbrahim Kanouche 110*46c4c49dSIbrahim Kanouche if err := b.classifyLicense(filename, headers); err != nil { 111*46c4c49dSIbrahim Kanouche errs <- err 112*46c4c49dSIbrahim Kanouche } 113*46c4c49dSIbrahim Kanouche } 114*46c4c49dSIbrahim Kanouche }() 115*46c4c49dSIbrahim Kanouche } 116*46c4c49dSIbrahim Kanouche 117*46c4c49dSIbrahim Kanouche wg.Wait() 118*46c4c49dSIbrahim Kanouche close(errs) 119*46c4c49dSIbrahim Kanouche 120*46c4c49dSIbrahim Kanouche for err := range errs { 121*46c4c49dSIbrahim Kanouche errors = append(errors, err) 122*46c4c49dSIbrahim Kanouche } 123*46c4c49dSIbrahim Kanouche return errors 124*46c4c49dSIbrahim Kanouche} 125*46c4c49dSIbrahim Kanouche 126*46c4c49dSIbrahim Kanouche// classifyLicense is called by a Go-function to perform the actual 127*46c4c49dSIbrahim Kanouche// classification of a license. 
128*46c4c49dSIbrahim Kanouchefunc (b *ClassifierBackend) classifyLicense(filename string, headers bool) error { 129*46c4c49dSIbrahim Kanouche contents, err := ioutil.ReadFile(filename) 130*46c4c49dSIbrahim Kanouche if err != nil { 131*46c4c49dSIbrahim Kanouche return fmt.Errorf("unable to read %q: %v", filename, err) 132*46c4c49dSIbrahim Kanouche } 133*46c4c49dSIbrahim Kanouche 134*46c4c49dSIbrahim Kanouche matchLoop := func(contents string) { 135*46c4c49dSIbrahim Kanouche for _, m := range b.classifier.MultipleMatch(contents, headers) { 136*46c4c49dSIbrahim Kanouche b.mu.Lock() 137*46c4c49dSIbrahim Kanouche b.results = append(b.results, &results.LicenseType{ 138*46c4c49dSIbrahim Kanouche Filename: filename, 139*46c4c49dSIbrahim Kanouche Name: m.Name, 140*46c4c49dSIbrahim Kanouche Confidence: m.Confidence, 141*46c4c49dSIbrahim Kanouche Offset: m.Offset, 142*46c4c49dSIbrahim Kanouche Extent: m.Extent, 143*46c4c49dSIbrahim Kanouche }) 144*46c4c49dSIbrahim Kanouche b.mu.Unlock() 145*46c4c49dSIbrahim Kanouche } 146*46c4c49dSIbrahim Kanouche } 147*46c4c49dSIbrahim Kanouche 148*46c4c49dSIbrahim Kanouche log.Printf("Classifying license(s): %s", filename) 149*46c4c49dSIbrahim Kanouche start := time.Now() 150*46c4c49dSIbrahim Kanouche if lang := language.ClassifyLanguage(filename); lang == language.Unknown { 151*46c4c49dSIbrahim Kanouche matchLoop(string(contents)) 152*46c4c49dSIbrahim Kanouche } else { 153*46c4c49dSIbrahim Kanouche log.Printf("detected language: %v", lang) 154*46c4c49dSIbrahim Kanouche comments := commentparser.Parse(contents, lang) 155*46c4c49dSIbrahim Kanouche for ch := range comments.ChunkIterator() { 156*46c4c49dSIbrahim Kanouche matchLoop(ch.String()) 157*46c4c49dSIbrahim Kanouche } 158*46c4c49dSIbrahim Kanouche } 159*46c4c49dSIbrahim Kanouche log.Printf("Finished Classifying License %q: %v", filename, time.Since(start)) 160*46c4c49dSIbrahim Kanouche return nil 161*46c4c49dSIbrahim Kanouche} 162*46c4c49dSIbrahim Kanouche 163*46c4c49dSIbrahim 
Kanouche// GetResults returns the results of the classifications. 164*46c4c49dSIbrahim Kanouchefunc (b *ClassifierBackend) GetResults() results.LicenseTypes { 165*46c4c49dSIbrahim Kanouche return b.results 166*46c4c49dSIbrahim Kanouche} 167