xref: /aosp_15_r20/external/licenseclassifier/tools/identify_license/backend/backend.go (revision 46c4c49da23cae783fa41bf46525a6505638499a)
1*46c4c49dSIbrahim Kanouche// Copyright 2017 Google Inc.
2*46c4c49dSIbrahim Kanouche//
3*46c4c49dSIbrahim Kanouche// Licensed under the Apache License, Version 2.0 (the "License");
4*46c4c49dSIbrahim Kanouche// you may not use this file except in compliance with the License.
5*46c4c49dSIbrahim Kanouche// You may obtain a copy of the License at
6*46c4c49dSIbrahim Kanouche//
7*46c4c49dSIbrahim Kanouche//     http://www.apache.org/licenses/LICENSE-2.0
8*46c4c49dSIbrahim Kanouche//
9*46c4c49dSIbrahim Kanouche// Unless required by applicable law or agreed to in writing, software
10*46c4c49dSIbrahim Kanouche// distributed under the License is distributed on an "AS IS" BASIS,
11*46c4c49dSIbrahim Kanouche// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*46c4c49dSIbrahim Kanouche// See the License for the specific language governing permissions and
13*46c4c49dSIbrahim Kanouche// limitations under the License.
14*46c4c49dSIbrahim Kanouche
15*46c4c49dSIbrahim Kanouche// Package backend contains the necessary functions to classify a license.
16*46c4c49dSIbrahim Kanouchepackage backend
17*46c4c49dSIbrahim Kanouche
18*46c4c49dSIbrahim Kanoucheimport (
19*46c4c49dSIbrahim Kanouche	"context"
20*46c4c49dSIbrahim Kanouche	"fmt"
21*46c4c49dSIbrahim Kanouche	"io/ioutil"
22*46c4c49dSIbrahim Kanouche	"log"
23*46c4c49dSIbrahim Kanouche	"sync"
24*46c4c49dSIbrahim Kanouche	"time"
25*46c4c49dSIbrahim Kanouche
26*46c4c49dSIbrahim Kanouche	"github.com/google/licenseclassifier"
27*46c4c49dSIbrahim Kanouche	"github.com/google/licenseclassifier/commentparser"
28*46c4c49dSIbrahim Kanouche	"github.com/google/licenseclassifier/commentparser/language"
29*46c4c49dSIbrahim Kanouche	"github.com/google/licenseclassifier/tools/identify_license/results"
30*46c4c49dSIbrahim Kanouche)
31*46c4c49dSIbrahim Kanouche
32*46c4c49dSIbrahim Kanouche// ClassifierInterface is the interface each backend must implement.
33*46c4c49dSIbrahim Kanouchetype ClassifierInterface interface {
34*46c4c49dSIbrahim Kanouche	Close()
35*46c4c49dSIbrahim Kanouche	ClassifyLicenses(filenames []string, headers bool) []error
36*46c4c49dSIbrahim Kanouche	ClassifyLicensesWithContext(ctx context.Context, filenames []string, headers bool) []error
37*46c4c49dSIbrahim Kanouche	GetResults() results.LicenseTypes
38*46c4c49dSIbrahim Kanouche}
39*46c4c49dSIbrahim Kanouche
40*46c4c49dSIbrahim Kanouche// ClassifierBackend is an object that handles classifying a license.
41*46c4c49dSIbrahim Kanouchetype ClassifierBackend struct {
42*46c4c49dSIbrahim Kanouche	results    results.LicenseTypes
43*46c4c49dSIbrahim Kanouche	mu         sync.Mutex
44*46c4c49dSIbrahim Kanouche	classifier *licenseclassifier.License
45*46c4c49dSIbrahim Kanouche}
46*46c4c49dSIbrahim Kanouche
47*46c4c49dSIbrahim Kanouche// New creates a new backend working on the local filesystem.
48*46c4c49dSIbrahim Kanouchefunc New(threshold float64, forbiddenOnly bool) (*ClassifierBackend, error) {
49*46c4c49dSIbrahim Kanouche	var lc *licenseclassifier.License
50*46c4c49dSIbrahim Kanouche	var err error
51*46c4c49dSIbrahim Kanouche	if forbiddenOnly {
52*46c4c49dSIbrahim Kanouche		lc, err = licenseclassifier.NewWithForbiddenLicenses(threshold)
53*46c4c49dSIbrahim Kanouche	} else {
54*46c4c49dSIbrahim Kanouche		lc, err = licenseclassifier.New(threshold)
55*46c4c49dSIbrahim Kanouche	}
56*46c4c49dSIbrahim Kanouche	if err != nil {
57*46c4c49dSIbrahim Kanouche		return nil, err
58*46c4c49dSIbrahim Kanouche	}
59*46c4c49dSIbrahim Kanouche	return &ClassifierBackend{classifier: lc}, nil
60*46c4c49dSIbrahim Kanouche}
61*46c4c49dSIbrahim Kanouche
62*46c4c49dSIbrahim Kanouche// Close does nothing here since there's nothing to close.
63*46c4c49dSIbrahim Kanouchefunc (b *ClassifierBackend) Close() {
64*46c4c49dSIbrahim Kanouche}
65*46c4c49dSIbrahim Kanouche
66*46c4c49dSIbrahim Kanouche// ClassifyLicenses runs the license classifier over the given file.
67*46c4c49dSIbrahim Kanouchefunc (b *ClassifierBackend) ClassifyLicenses(filenames []string, headers bool) (errors []error) {
68*46c4c49dSIbrahim Kanouche	return b.ClassifyLicensesWithContext(context.Background(), filenames, headers)
69*46c4c49dSIbrahim Kanouche}
70*46c4c49dSIbrahim Kanouche
71*46c4c49dSIbrahim Kanouche// ClassifyLicensesWithContext runs the license classifier over the given file;
72*46c4c49dSIbrahim Kanouche// ensure that it will respect the timeout and cancelation in the provided context.
73*46c4c49dSIbrahim Kanouchefunc (b *ClassifierBackend) ClassifyLicensesWithContext(ctx context.Context, filenames []string, headers bool) (errors []error) {
74*46c4c49dSIbrahim Kanouche
75*46c4c49dSIbrahim Kanouche	files := make(chan string, len(filenames))
76*46c4c49dSIbrahim Kanouche	for _, f := range filenames {
77*46c4c49dSIbrahim Kanouche		files <- f
78*46c4c49dSIbrahim Kanouche	}
79*46c4c49dSIbrahim Kanouche	close(files)
80*46c4c49dSIbrahim Kanouche	errs := make(chan error, len(filenames))
81*46c4c49dSIbrahim Kanouche
82*46c4c49dSIbrahim Kanouche	var wg sync.WaitGroup
83*46c4c49dSIbrahim Kanouche
84*46c4c49dSIbrahim Kanouche	// Create a pool from which tasks can later be started. We use a pool because the OS limits
85*46c4c49dSIbrahim Kanouche	// the number of files that can be open at any one time.
86*46c4c49dSIbrahim Kanouche	const numTasks = 1000
87*46c4c49dSIbrahim Kanouche	wg.Add(numTasks)
88*46c4c49dSIbrahim Kanouche
89*46c4c49dSIbrahim Kanouche	for i := 0; i < numTasks; i++ {
90*46c4c49dSIbrahim Kanouche		go func() {
91*46c4c49dSIbrahim Kanouche			// Ensure that however this function terminates, the wait group
92*46c4c49dSIbrahim Kanouche			// is unblocked
93*46c4c49dSIbrahim Kanouche			defer wg.Done()
94*46c4c49dSIbrahim Kanouche
95*46c4c49dSIbrahim Kanouche			for {
96*46c4c49dSIbrahim Kanouche				filename := <-files
97*46c4c49dSIbrahim Kanouche
98*46c4c49dSIbrahim Kanouche				// no file? we're done
99*46c4c49dSIbrahim Kanouche				if filename == "" {
100*46c4c49dSIbrahim Kanouche					break
101*46c4c49dSIbrahim Kanouche				}
102*46c4c49dSIbrahim Kanouche
103*46c4c49dSIbrahim Kanouche				// If the context is done, record that the file was not
104*46c4c49dSIbrahim Kanouche				// classified due to the context's termination.
105*46c4c49dSIbrahim Kanouche				if err := ctx.Err(); err != nil {
106*46c4c49dSIbrahim Kanouche					errs <- fmt.Errorf("file %s not classified due to context completion: %v", filename, err)
107*46c4c49dSIbrahim Kanouche					continue
108*46c4c49dSIbrahim Kanouche				}
109*46c4c49dSIbrahim Kanouche
110*46c4c49dSIbrahim Kanouche				if err := b.classifyLicense(filename, headers); err != nil {
111*46c4c49dSIbrahim Kanouche					errs <- err
112*46c4c49dSIbrahim Kanouche				}
113*46c4c49dSIbrahim Kanouche			}
114*46c4c49dSIbrahim Kanouche		}()
115*46c4c49dSIbrahim Kanouche	}
116*46c4c49dSIbrahim Kanouche
117*46c4c49dSIbrahim Kanouche	wg.Wait()
118*46c4c49dSIbrahim Kanouche	close(errs)
119*46c4c49dSIbrahim Kanouche
120*46c4c49dSIbrahim Kanouche	for err := range errs {
121*46c4c49dSIbrahim Kanouche		errors = append(errors, err)
122*46c4c49dSIbrahim Kanouche	}
123*46c4c49dSIbrahim Kanouche	return errors
124*46c4c49dSIbrahim Kanouche}
125*46c4c49dSIbrahim Kanouche
126*46c4c49dSIbrahim Kanouche// classifyLicense is called by a Go-function to perform the actual
127*46c4c49dSIbrahim Kanouche// classification of a license.
128*46c4c49dSIbrahim Kanouchefunc (b *ClassifierBackend) classifyLicense(filename string, headers bool) error {
129*46c4c49dSIbrahim Kanouche	contents, err := ioutil.ReadFile(filename)
130*46c4c49dSIbrahim Kanouche	if err != nil {
131*46c4c49dSIbrahim Kanouche		return fmt.Errorf("unable to read %q: %v", filename, err)
132*46c4c49dSIbrahim Kanouche	}
133*46c4c49dSIbrahim Kanouche
134*46c4c49dSIbrahim Kanouche	matchLoop := func(contents string) {
135*46c4c49dSIbrahim Kanouche		for _, m := range b.classifier.MultipleMatch(contents, headers) {
136*46c4c49dSIbrahim Kanouche			b.mu.Lock()
137*46c4c49dSIbrahim Kanouche			b.results = append(b.results, &results.LicenseType{
138*46c4c49dSIbrahim Kanouche				Filename:   filename,
139*46c4c49dSIbrahim Kanouche				Name:       m.Name,
140*46c4c49dSIbrahim Kanouche				Confidence: m.Confidence,
141*46c4c49dSIbrahim Kanouche				Offset:     m.Offset,
142*46c4c49dSIbrahim Kanouche				Extent:     m.Extent,
143*46c4c49dSIbrahim Kanouche			})
144*46c4c49dSIbrahim Kanouche			b.mu.Unlock()
145*46c4c49dSIbrahim Kanouche		}
146*46c4c49dSIbrahim Kanouche	}
147*46c4c49dSIbrahim Kanouche
148*46c4c49dSIbrahim Kanouche	log.Printf("Classifying license(s): %s", filename)
149*46c4c49dSIbrahim Kanouche	start := time.Now()
150*46c4c49dSIbrahim Kanouche	if lang := language.ClassifyLanguage(filename); lang == language.Unknown {
151*46c4c49dSIbrahim Kanouche		matchLoop(string(contents))
152*46c4c49dSIbrahim Kanouche	} else {
153*46c4c49dSIbrahim Kanouche		log.Printf("detected language: %v", lang)
154*46c4c49dSIbrahim Kanouche		comments := commentparser.Parse(contents, lang)
155*46c4c49dSIbrahim Kanouche		for ch := range comments.ChunkIterator() {
156*46c4c49dSIbrahim Kanouche			matchLoop(ch.String())
157*46c4c49dSIbrahim Kanouche		}
158*46c4c49dSIbrahim Kanouche	}
159*46c4c49dSIbrahim Kanouche	log.Printf("Finished Classifying License %q: %v", filename, time.Since(start))
160*46c4c49dSIbrahim Kanouche	return nil
161*46c4c49dSIbrahim Kanouche}
162*46c4c49dSIbrahim Kanouche
163*46c4c49dSIbrahim Kanouche// GetResults returns the results of the classifications.
164*46c4c49dSIbrahim Kanouchefunc (b *ClassifierBackend) GetResults() results.LicenseTypes {
165*46c4c49dSIbrahim Kanouche	return b.results
166*46c4c49dSIbrahim Kanouche}
167