xref: /aosp_15_r20/external/licenseclassifier/serializer/serializer.go (revision 46c4c49da23cae783fa41bf46525a6505638499a)
1*46c4c49dSIbrahim Kanouche// Copyright 2017 Google Inc.
2*46c4c49dSIbrahim Kanouche//
3*46c4c49dSIbrahim Kanouche// Licensed under the Apache License, Version 2.0 (the "License");
4*46c4c49dSIbrahim Kanouche// you may not use this file except in compliance with the License.
5*46c4c49dSIbrahim Kanouche// You may obtain a copy of the License at
6*46c4c49dSIbrahim Kanouche//
7*46c4c49dSIbrahim Kanouche//     http://www.apache.org/licenses/LICENSE-2.0
8*46c4c49dSIbrahim Kanouche//
9*46c4c49dSIbrahim Kanouche// Unless required by applicable law or agreed to in writing, software
10*46c4c49dSIbrahim Kanouche// distributed under the License is distributed on an "AS IS" BASIS,
11*46c4c49dSIbrahim Kanouche// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12*46c4c49dSIbrahim Kanouche// See the License for the specific language governing permissions and
13*46c4c49dSIbrahim Kanouche// limitations under the License.
14*46c4c49dSIbrahim Kanouche
15*46c4c49dSIbrahim Kanouche// Package serializer normalizes the license text and calculates the hash
16*46c4c49dSIbrahim Kanouche// values for all substrings in the license. It then outputs the normalized
17*46c4c49dSIbrahim Kanouche// text and hashes to disk in a compressed archive.
18*46c4c49dSIbrahim Kanouchepackage serializer
19*46c4c49dSIbrahim Kanouche
20*46c4c49dSIbrahim Kanoucheimport (
21*46c4c49dSIbrahim Kanouche	"archive/tar"
22*46c4c49dSIbrahim Kanouche	"bytes"
23*46c4c49dSIbrahim Kanouche	"compress/gzip"
24*46c4c49dSIbrahim Kanouche	"io"
25*46c4c49dSIbrahim Kanouche	"log"
26*46c4c49dSIbrahim Kanouche	"path/filepath"
27*46c4c49dSIbrahim Kanouche	"strings"
28*46c4c49dSIbrahim Kanouche
29*46c4c49dSIbrahim Kanouche	"github.com/google/licenseclassifier"
30*46c4c49dSIbrahim Kanouche	"github.com/google/licenseclassifier/stringclassifier/searchset"
31*46c4c49dSIbrahim Kanouche)
32*46c4c49dSIbrahim Kanouche
33*46c4c49dSIbrahim Kanouche// ArchiveLicenses takes all of the known license texts, normalizes them, then
34*46c4c49dSIbrahim Kanouche// calculates the hash values of all substrings. The resulting normalized text
35*46c4c49dSIbrahim Kanouche// and hashed substring values are then serialized into an archive file.
36*46c4c49dSIbrahim Kanouchefunc ArchiveLicenses(licenses []string, w io.Writer) error {
37*46c4c49dSIbrahim Kanouche	gw := gzip.NewWriter(w)
38*46c4c49dSIbrahim Kanouche	defer gw.Close()
39*46c4c49dSIbrahim Kanouche
40*46c4c49dSIbrahim Kanouche	tw := tar.NewWriter(gw)
41*46c4c49dSIbrahim Kanouche	for _, license := range licenses {
42*46c4c49dSIbrahim Kanouche		// All license files have a ".txt" extension.
43*46c4c49dSIbrahim Kanouche		ext := filepath.Ext(license)
44*46c4c49dSIbrahim Kanouche		if ext != ".txt" {
45*46c4c49dSIbrahim Kanouche			continue
46*46c4c49dSIbrahim Kanouche		}
47*46c4c49dSIbrahim Kanouche
48*46c4c49dSIbrahim Kanouche		contents, err := licenseclassifier.ReadLicenseFile(license)
49*46c4c49dSIbrahim Kanouche		if err != nil {
50*46c4c49dSIbrahim Kanouche			return err
51*46c4c49dSIbrahim Kanouche		}
52*46c4c49dSIbrahim Kanouche
53*46c4c49dSIbrahim Kanouche		str := licenseclassifier.TrimExtraneousTrailingText(string(contents))
54*46c4c49dSIbrahim Kanouche		for _, n := range licenseclassifier.Normalizers {
55*46c4c49dSIbrahim Kanouche			str = n(str)
56*46c4c49dSIbrahim Kanouche		}
57*46c4c49dSIbrahim Kanouche
58*46c4c49dSIbrahim Kanouche		baseName := strings.TrimSuffix(filepath.Base(license), ext)
59*46c4c49dSIbrahim Kanouche
60*46c4c49dSIbrahim Kanouche		// Serialize the normalized license text.
61*46c4c49dSIbrahim Kanouche		log.Printf("Serializing %q", baseName)
62*46c4c49dSIbrahim Kanouche		hdr := &tar.Header{
63*46c4c49dSIbrahim Kanouche			Name: filepath.Base(license),
64*46c4c49dSIbrahim Kanouche			Mode: 0644,
65*46c4c49dSIbrahim Kanouche			Size: int64(len(str)),
66*46c4c49dSIbrahim Kanouche		}
67*46c4c49dSIbrahim Kanouche
68*46c4c49dSIbrahim Kanouche		if err := tw.WriteHeader(hdr); err != nil {
69*46c4c49dSIbrahim Kanouche			return err
70*46c4c49dSIbrahim Kanouche		}
71*46c4c49dSIbrahim Kanouche		if _, err := tw.Write([]byte(str)); err != nil {
72*46c4c49dSIbrahim Kanouche			return err
73*46c4c49dSIbrahim Kanouche		}
74*46c4c49dSIbrahim Kanouche
75*46c4c49dSIbrahim Kanouche		// Calculate the substrings' checksums
76*46c4c49dSIbrahim Kanouche		set := searchset.New(str, searchset.DefaultGranularity)
77*46c4c49dSIbrahim Kanouche
78*46c4c49dSIbrahim Kanouche		var s bytes.Buffer
79*46c4c49dSIbrahim Kanouche		if err := set.Serialize(&s); err != nil {
80*46c4c49dSIbrahim Kanouche			return err
81*46c4c49dSIbrahim Kanouche		}
82*46c4c49dSIbrahim Kanouche
83*46c4c49dSIbrahim Kanouche		// Serialize the checksums.
84*46c4c49dSIbrahim Kanouche		hdr = &tar.Header{
85*46c4c49dSIbrahim Kanouche			Name: baseName + ".hash",
86*46c4c49dSIbrahim Kanouche			Mode: 0644,
87*46c4c49dSIbrahim Kanouche			Size: int64(s.Len()),
88*46c4c49dSIbrahim Kanouche		}
89*46c4c49dSIbrahim Kanouche
90*46c4c49dSIbrahim Kanouche		if err := tw.WriteHeader(hdr); err != nil {
91*46c4c49dSIbrahim Kanouche			return err
92*46c4c49dSIbrahim Kanouche		}
93*46c4c49dSIbrahim Kanouche		if _, err := tw.Write(s.Bytes()); err != nil {
94*46c4c49dSIbrahim Kanouche			return err
95*46c4c49dSIbrahim Kanouche		}
96*46c4c49dSIbrahim Kanouche	}
97*46c4c49dSIbrahim Kanouche
98*46c4c49dSIbrahim Kanouche	return tw.Close()
99*46c4c49dSIbrahim Kanouche}
100