// Copyright 2017 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

// Package serializer normalizes the license text and calculates the hash
// values for all substrings in the license. It then outputs the normalized
// text and hashes to disk in a compressed archive.
18*46c4c49dSIbrahim Kanouchepackage serializer 19*46c4c49dSIbrahim Kanouche 20*46c4c49dSIbrahim Kanoucheimport ( 21*46c4c49dSIbrahim Kanouche "archive/tar" 22*46c4c49dSIbrahim Kanouche "bytes" 23*46c4c49dSIbrahim Kanouche "compress/gzip" 24*46c4c49dSIbrahim Kanouche "io" 25*46c4c49dSIbrahim Kanouche "log" 26*46c4c49dSIbrahim Kanouche "path/filepath" 27*46c4c49dSIbrahim Kanouche "strings" 28*46c4c49dSIbrahim Kanouche 29*46c4c49dSIbrahim Kanouche "github.com/google/licenseclassifier" 30*46c4c49dSIbrahim Kanouche "github.com/google/licenseclassifier/stringclassifier/searchset" 31*46c4c49dSIbrahim Kanouche) 32*46c4c49dSIbrahim Kanouche 33*46c4c49dSIbrahim Kanouche// ArchiveLicenses takes all of the known license texts, normalizes them, then 34*46c4c49dSIbrahim Kanouche// calculates the hash values of all substrings. The resulting normalized text 35*46c4c49dSIbrahim Kanouche// and hashed substring values are then serialized into an archive file. 36*46c4c49dSIbrahim Kanouchefunc ArchiveLicenses(licenses []string, w io.Writer) error { 37*46c4c49dSIbrahim Kanouche gw := gzip.NewWriter(w) 38*46c4c49dSIbrahim Kanouche defer gw.Close() 39*46c4c49dSIbrahim Kanouche 40*46c4c49dSIbrahim Kanouche tw := tar.NewWriter(gw) 41*46c4c49dSIbrahim Kanouche for _, license := range licenses { 42*46c4c49dSIbrahim Kanouche // All license files have a ".txt" extension. 
43*46c4c49dSIbrahim Kanouche ext := filepath.Ext(license) 44*46c4c49dSIbrahim Kanouche if ext != ".txt" { 45*46c4c49dSIbrahim Kanouche continue 46*46c4c49dSIbrahim Kanouche } 47*46c4c49dSIbrahim Kanouche 48*46c4c49dSIbrahim Kanouche contents, err := licenseclassifier.ReadLicenseFile(license) 49*46c4c49dSIbrahim Kanouche if err != nil { 50*46c4c49dSIbrahim Kanouche return err 51*46c4c49dSIbrahim Kanouche } 52*46c4c49dSIbrahim Kanouche 53*46c4c49dSIbrahim Kanouche str := licenseclassifier.TrimExtraneousTrailingText(string(contents)) 54*46c4c49dSIbrahim Kanouche for _, n := range licenseclassifier.Normalizers { 55*46c4c49dSIbrahim Kanouche str = n(str) 56*46c4c49dSIbrahim Kanouche } 57*46c4c49dSIbrahim Kanouche 58*46c4c49dSIbrahim Kanouche baseName := strings.TrimSuffix(filepath.Base(license), ext) 59*46c4c49dSIbrahim Kanouche 60*46c4c49dSIbrahim Kanouche // Serialize the normalized license text. 61*46c4c49dSIbrahim Kanouche log.Printf("Serializing %q", baseName) 62*46c4c49dSIbrahim Kanouche hdr := &tar.Header{ 63*46c4c49dSIbrahim Kanouche Name: filepath.Base(license), 64*46c4c49dSIbrahim Kanouche Mode: 0644, 65*46c4c49dSIbrahim Kanouche Size: int64(len(str)), 66*46c4c49dSIbrahim Kanouche } 67*46c4c49dSIbrahim Kanouche 68*46c4c49dSIbrahim Kanouche if err := tw.WriteHeader(hdr); err != nil { 69*46c4c49dSIbrahim Kanouche return err 70*46c4c49dSIbrahim Kanouche } 71*46c4c49dSIbrahim Kanouche if _, err := tw.Write([]byte(str)); err != nil { 72*46c4c49dSIbrahim Kanouche return err 73*46c4c49dSIbrahim Kanouche } 74*46c4c49dSIbrahim Kanouche 75*46c4c49dSIbrahim Kanouche // Calculate the substrings' checksums 76*46c4c49dSIbrahim Kanouche set := searchset.New(str, searchset.DefaultGranularity) 77*46c4c49dSIbrahim Kanouche 78*46c4c49dSIbrahim Kanouche var s bytes.Buffer 79*46c4c49dSIbrahim Kanouche if err := set.Serialize(&s); err != nil { 80*46c4c49dSIbrahim Kanouche return err 81*46c4c49dSIbrahim Kanouche } 82*46c4c49dSIbrahim Kanouche 83*46c4c49dSIbrahim Kanouche // 
Serialize the checksums. 84*46c4c49dSIbrahim Kanouche hdr = &tar.Header{ 85*46c4c49dSIbrahim Kanouche Name: baseName + ".hash", 86*46c4c49dSIbrahim Kanouche Mode: 0644, 87*46c4c49dSIbrahim Kanouche Size: int64(s.Len()), 88*46c4c49dSIbrahim Kanouche } 89*46c4c49dSIbrahim Kanouche 90*46c4c49dSIbrahim Kanouche if err := tw.WriteHeader(hdr); err != nil { 91*46c4c49dSIbrahim Kanouche return err 92*46c4c49dSIbrahim Kanouche } 93*46c4c49dSIbrahim Kanouche if _, err := tw.Write(s.Bytes()); err != nil { 94*46c4c49dSIbrahim Kanouche return err 95*46c4c49dSIbrahim Kanouche } 96*46c4c49dSIbrahim Kanouche } 97*46c4c49dSIbrahim Kanouche 98*46c4c49dSIbrahim Kanouche return tw.Close() 99*46c4c49dSIbrahim Kanouche} 100