1*1c12ee1eSDan Willemsen// Copyright 2019 The Go Authors. All rights reserved. 2*1c12ee1eSDan Willemsen// Use of this source code is governed by a BSD-style 3*1c12ee1eSDan Willemsen// license that can be found in the LICENSE file. 4*1c12ee1eSDan Willemsen 5*1c12ee1eSDan Willemsen// Package strs provides string manipulation functionality specific to protobuf. 6*1c12ee1eSDan Willemsenpackage strs 7*1c12ee1eSDan Willemsen 8*1c12ee1eSDan Willemsenimport ( 9*1c12ee1eSDan Willemsen "go/token" 10*1c12ee1eSDan Willemsen "strings" 11*1c12ee1eSDan Willemsen "unicode" 12*1c12ee1eSDan Willemsen "unicode/utf8" 13*1c12ee1eSDan Willemsen 14*1c12ee1eSDan Willemsen "google.golang.org/protobuf/internal/flags" 15*1c12ee1eSDan Willemsen "google.golang.org/protobuf/reflect/protoreflect" 16*1c12ee1eSDan Willemsen) 17*1c12ee1eSDan Willemsen 18*1c12ee1eSDan Willemsen// EnforceUTF8 reports whether to enforce strict UTF-8 validation. 19*1c12ee1eSDan Willemsenfunc EnforceUTF8(fd protoreflect.FieldDescriptor) bool { 20*1c12ee1eSDan Willemsen if flags.ProtoLegacy { 21*1c12ee1eSDan Willemsen if fd, ok := fd.(interface{ EnforceUTF8() bool }); ok { 22*1c12ee1eSDan Willemsen return fd.EnforceUTF8() 23*1c12ee1eSDan Willemsen } 24*1c12ee1eSDan Willemsen } 25*1c12ee1eSDan Willemsen return fd.Syntax() == protoreflect.Proto3 26*1c12ee1eSDan Willemsen} 27*1c12ee1eSDan Willemsen 28*1c12ee1eSDan Willemsen// GoCamelCase camel-cases a protobuf name for use as a Go identifier. 29*1c12ee1eSDan Willemsen// 30*1c12ee1eSDan Willemsen// If there is an interior underscore followed by a lower case letter, 31*1c12ee1eSDan Willemsen// drop the underscore and convert the letter to upper case. 32*1c12ee1eSDan Willemsenfunc GoCamelCase(s string) string { 33*1c12ee1eSDan Willemsen // Invariant: if the next letter is lower case, it must be converted 34*1c12ee1eSDan Willemsen // to upper case. 
35*1c12ee1eSDan Willemsen // That is, we process a word at a time, where words are marked by _ or 36*1c12ee1eSDan Willemsen // upper case letter. Digits are treated as words. 37*1c12ee1eSDan Willemsen var b []byte 38*1c12ee1eSDan Willemsen for i := 0; i < len(s); i++ { 39*1c12ee1eSDan Willemsen c := s[i] 40*1c12ee1eSDan Willemsen switch { 41*1c12ee1eSDan Willemsen case c == '.' && i+1 < len(s) && isASCIILower(s[i+1]): 42*1c12ee1eSDan Willemsen // Skip over '.' in ".{{lowercase}}". 43*1c12ee1eSDan Willemsen case c == '.': 44*1c12ee1eSDan Willemsen b = append(b, '_') // convert '.' to '_' 45*1c12ee1eSDan Willemsen case c == '_' && (i == 0 || s[i-1] == '.'): 46*1c12ee1eSDan Willemsen // Convert initial '_' to ensure we start with a capital letter. 47*1c12ee1eSDan Willemsen // Do the same for '_' after '.' to match historic behavior. 48*1c12ee1eSDan Willemsen b = append(b, 'X') // convert '_' to 'X' 49*1c12ee1eSDan Willemsen case c == '_' && i+1 < len(s) && isASCIILower(s[i+1]): 50*1c12ee1eSDan Willemsen // Skip over '_' in "_{{lowercase}}". 51*1c12ee1eSDan Willemsen case isASCIIDigit(c): 52*1c12ee1eSDan Willemsen b = append(b, c) 53*1c12ee1eSDan Willemsen default: 54*1c12ee1eSDan Willemsen // Assume we have a letter now - if not, it's a bogus identifier. 55*1c12ee1eSDan Willemsen // The next word is a sequence of characters that must start upper case. 56*1c12ee1eSDan Willemsen if isASCIILower(c) { 57*1c12ee1eSDan Willemsen c -= 'a' - 'A' // convert lowercase to uppercase 58*1c12ee1eSDan Willemsen } 59*1c12ee1eSDan Willemsen b = append(b, c) 60*1c12ee1eSDan Willemsen 61*1c12ee1eSDan Willemsen // Accept lower case sequence that follows. 
62*1c12ee1eSDan Willemsen for ; i+1 < len(s) && isASCIILower(s[i+1]); i++ { 63*1c12ee1eSDan Willemsen b = append(b, s[i+1]) 64*1c12ee1eSDan Willemsen } 65*1c12ee1eSDan Willemsen } 66*1c12ee1eSDan Willemsen } 67*1c12ee1eSDan Willemsen return string(b) 68*1c12ee1eSDan Willemsen} 69*1c12ee1eSDan Willemsen 70*1c12ee1eSDan Willemsen// GoSanitized converts a string to a valid Go identifier. 71*1c12ee1eSDan Willemsenfunc GoSanitized(s string) string { 72*1c12ee1eSDan Willemsen // Sanitize the input to the set of valid characters, 73*1c12ee1eSDan Willemsen // which must be '_' or be in the Unicode L or N categories. 74*1c12ee1eSDan Willemsen s = strings.Map(func(r rune) rune { 75*1c12ee1eSDan Willemsen if unicode.IsLetter(r) || unicode.IsDigit(r) { 76*1c12ee1eSDan Willemsen return r 77*1c12ee1eSDan Willemsen } 78*1c12ee1eSDan Willemsen return '_' 79*1c12ee1eSDan Willemsen }, s) 80*1c12ee1eSDan Willemsen 81*1c12ee1eSDan Willemsen // Prepend '_' in the event of a Go keyword conflict or if 82*1c12ee1eSDan Willemsen // the identifier is invalid (does not start in the Unicode L category). 83*1c12ee1eSDan Willemsen r, _ := utf8.DecodeRuneInString(s) 84*1c12ee1eSDan Willemsen if token.Lookup(s).IsKeyword() || !unicode.IsLetter(r) { 85*1c12ee1eSDan Willemsen return "_" + s 86*1c12ee1eSDan Willemsen } 87*1c12ee1eSDan Willemsen return s 88*1c12ee1eSDan Willemsen} 89*1c12ee1eSDan Willemsen 90*1c12ee1eSDan Willemsen// JSONCamelCase converts a snake_case identifier to a camelCase identifier, 91*1c12ee1eSDan Willemsen// according to the protobuf JSON specification. 
//
// Every '_' is dropped, and an ASCII lower-case letter that directly follows
// a '_' is converted to upper case. All other bytes are copied unchanged.
func JSONCamelCase(s string) string {
	out := make([]byte, 0, len(s))
	afterUnderscore := false
	for _, ch := range []byte(s) { // proto identifiers are always ASCII
		if ch == '_' {
			afterUnderscore = true
			continue
		}
		if afterUnderscore && 'a' <= ch && ch <= 'z' {
			ch -= 'a' - 'A' // convert to uppercase
		}
		afterUnderscore = false
		out = append(out, ch)
	}
	return string(out)
}

// JSONSnakeCase converts a camelCase identifier to a snake_case identifier,
// according to the protobuf JSON specification.
//
// Each ASCII upper-case letter is replaced by '_' followed by its lower-case
// form. All other bytes are copied unchanged.
func JSONSnakeCase(s string) string {
	out := make([]byte, 0, len(s))
	for _, ch := range []byte(s) { // proto identifiers are always ASCII
		if 'A' <= ch && ch <= 'Z' {
			out = append(out, '_', ch+('a'-'A'))
			continue
		}
		out = append(out, ch)
	}
	return string(out)
}

// MapEntryName derives the name of the map entry message given the field name.
// See protoc v3.8.0: src/google/protobuf/descriptor.cc:254-276,6057
//
// Underscores are dropped, the first byte and every byte that directly
// follows an underscore are converted to ASCII upper case, and "Entry" is
// appended. The input is processed byte by byte and only ASCII letters are
// case-converted, so non-ASCII bytes pass through unchanged instead of being
// truncated to a single byte.
func MapEntryName(s string) string {
	b := make([]byte, 0, len(s)+len("Entry"))
	upperNext := true
	for i := 0; i < len(s); i++ {
		c := s[i]
		switch {
		case c == '_':
			upperNext = true
		case upperNext:
			if 'a' <= c && c <= 'z' {
				c -= 'a' - 'A' // convert to uppercase
			}
			b = append(b, c)
			upperNext = false
		default:
			b = append(b, c)
		}
	}
	b = append(b, "Entry"...)
	return string(b)
}

// EnumValueName derives the camel-cased enum value name.
// See protoc v3.8.0: src/google/protobuf/descriptor.cc:297-313
//
// Underscores are dropped, the first byte and every byte that directly
// follows an underscore are converted to ASCII upper case, and every other
// byte is converted to ASCII lower case. The input is processed byte by byte
// and only ASCII letters are case-converted, so non-ASCII bytes pass through
// unchanged instead of being truncated to a single byte.
func EnumValueName(s string) string {
	b := make([]byte, 0, len(s))
	upperNext := true
	for i := 0; i < len(s); i++ {
		c := s[i]
		switch {
		case c == '_':
			upperNext = true
		case upperNext:
			if 'a' <= c && c <= 'z' {
				c -= 'a' - 'A' // convert to uppercase
			}
			b = append(b, c)
			upperNext = false
		default:
			if 'A' <= c && c <= 'Z' {
				c += 'a' - 'A' // convert to lowercase
			}
			b = append(b, c)
		}
	}
	return string(b)
}

// TrimEnumPrefix trims the enum name prefix from an enum value name,
// where the prefix is all lowercase without underscores.
// See protoc v3.8.0: src/google/protobuf/descriptor.cc:330-375
//
// The prefix is matched against s byte by byte, ignoring underscores in s and
// ignoring ASCII case in s. If the whole prefix matches, the remainder of s
// with any leading underscores removed is returned. The original s is
// returned unchanged when the prefix does not match or when trimming would
// leave an empty string.
func TrimEnumPrefix(s, prefix string) string {
	s0 := s // original input
	for len(s) > 0 && len(prefix) > 0 {
		if s[0] == '_' {
			s = s[1:]
			continue
		}
		// Lower-case ASCII letters only. Treating a raw byte as a Unicode
		// code point would fold bytes >= 0x80 as if they were Latin-1
		// letters and could report a match between different bytes.
		c := s[0]
		if isASCIIUpper(c) {
			c += 'a' - 'A'
		}
		if c != prefix[0] {
			return s0 // no prefix match
		}
		s, prefix = s[1:], prefix[1:]
	}
	if len(prefix) > 0 {
		return s0 // no prefix match
	}
	s = strings.TrimLeft(s, "_")
	if len(s) == 0 {
		return s0 // avoid returning empty string
	}
	return s
}

// isASCIILower reports whether c is an ASCII lower-case letter ('a'-'z').
func isASCIILower(c byte) bool {
	return 'a' <= c && c <= 'z'
}

// isASCIIUpper reports whether c is an ASCII upper-case letter ('A'-'Z').
func isASCIIUpper(c byte) bool {
	return 'A' <= c && c <= 'Z'
}

// isASCIIDigit reports whether c is an ASCII decimal digit ('0'-'9').
func isASCIIDigit(c byte) bool {
	return '0' <= c && c <= '9'
}