xref: /aosp_15_r20/external/golang-protobuf/internal/strs/strings.go (revision 1c12ee1efe575feb122dbf939ff15148a3b3e8f2)
1*1c12ee1eSDan Willemsen// Copyright 2019 The Go Authors. All rights reserved.
2*1c12ee1eSDan Willemsen// Use of this source code is governed by a BSD-style
3*1c12ee1eSDan Willemsen// license that can be found in the LICENSE file.
4*1c12ee1eSDan Willemsen
5*1c12ee1eSDan Willemsen// Package strs provides string manipulation functionality specific to protobuf.
6*1c12ee1eSDan Willemsenpackage strs
7*1c12ee1eSDan Willemsen
8*1c12ee1eSDan Willemsenimport (
9*1c12ee1eSDan Willemsen	"go/token"
10*1c12ee1eSDan Willemsen	"strings"
11*1c12ee1eSDan Willemsen	"unicode"
12*1c12ee1eSDan Willemsen	"unicode/utf8"
13*1c12ee1eSDan Willemsen
14*1c12ee1eSDan Willemsen	"google.golang.org/protobuf/internal/flags"
15*1c12ee1eSDan Willemsen	"google.golang.org/protobuf/reflect/protoreflect"
16*1c12ee1eSDan Willemsen)
17*1c12ee1eSDan Willemsen
18*1c12ee1eSDan Willemsen// EnforceUTF8 reports whether to enforce strict UTF-8 validation.
19*1c12ee1eSDan Willemsenfunc EnforceUTF8(fd protoreflect.FieldDescriptor) bool {
20*1c12ee1eSDan Willemsen	if flags.ProtoLegacy {
21*1c12ee1eSDan Willemsen		if fd, ok := fd.(interface{ EnforceUTF8() bool }); ok {
22*1c12ee1eSDan Willemsen			return fd.EnforceUTF8()
23*1c12ee1eSDan Willemsen		}
24*1c12ee1eSDan Willemsen	}
25*1c12ee1eSDan Willemsen	return fd.Syntax() == protoreflect.Proto3
26*1c12ee1eSDan Willemsen}
27*1c12ee1eSDan Willemsen
28*1c12ee1eSDan Willemsen// GoCamelCase camel-cases a protobuf name for use as a Go identifier.
29*1c12ee1eSDan Willemsen//
30*1c12ee1eSDan Willemsen// If there is an interior underscore followed by a lower case letter,
31*1c12ee1eSDan Willemsen// drop the underscore and convert the letter to upper case.
32*1c12ee1eSDan Willemsenfunc GoCamelCase(s string) string {
33*1c12ee1eSDan Willemsen	// Invariant: if the next letter is lower case, it must be converted
34*1c12ee1eSDan Willemsen	// to upper case.
35*1c12ee1eSDan Willemsen	// That is, we process a word at a time, where words are marked by _ or
36*1c12ee1eSDan Willemsen	// upper case letter. Digits are treated as words.
37*1c12ee1eSDan Willemsen	var b []byte
38*1c12ee1eSDan Willemsen	for i := 0; i < len(s); i++ {
39*1c12ee1eSDan Willemsen		c := s[i]
40*1c12ee1eSDan Willemsen		switch {
41*1c12ee1eSDan Willemsen		case c == '.' && i+1 < len(s) && isASCIILower(s[i+1]):
42*1c12ee1eSDan Willemsen			// Skip over '.' in ".{{lowercase}}".
43*1c12ee1eSDan Willemsen		case c == '.':
44*1c12ee1eSDan Willemsen			b = append(b, '_') // convert '.' to '_'
45*1c12ee1eSDan Willemsen		case c == '_' && (i == 0 || s[i-1] == '.'):
46*1c12ee1eSDan Willemsen			// Convert initial '_' to ensure we start with a capital letter.
47*1c12ee1eSDan Willemsen			// Do the same for '_' after '.' to match historic behavior.
48*1c12ee1eSDan Willemsen			b = append(b, 'X') // convert '_' to 'X'
49*1c12ee1eSDan Willemsen		case c == '_' && i+1 < len(s) && isASCIILower(s[i+1]):
50*1c12ee1eSDan Willemsen			// Skip over '_' in "_{{lowercase}}".
51*1c12ee1eSDan Willemsen		case isASCIIDigit(c):
52*1c12ee1eSDan Willemsen			b = append(b, c)
53*1c12ee1eSDan Willemsen		default:
54*1c12ee1eSDan Willemsen			// Assume we have a letter now - if not, it's a bogus identifier.
55*1c12ee1eSDan Willemsen			// The next word is a sequence of characters that must start upper case.
56*1c12ee1eSDan Willemsen			if isASCIILower(c) {
57*1c12ee1eSDan Willemsen				c -= 'a' - 'A' // convert lowercase to uppercase
58*1c12ee1eSDan Willemsen			}
59*1c12ee1eSDan Willemsen			b = append(b, c)
60*1c12ee1eSDan Willemsen
61*1c12ee1eSDan Willemsen			// Accept lower case sequence that follows.
62*1c12ee1eSDan Willemsen			for ; i+1 < len(s) && isASCIILower(s[i+1]); i++ {
63*1c12ee1eSDan Willemsen				b = append(b, s[i+1])
64*1c12ee1eSDan Willemsen			}
65*1c12ee1eSDan Willemsen		}
66*1c12ee1eSDan Willemsen	}
67*1c12ee1eSDan Willemsen	return string(b)
68*1c12ee1eSDan Willemsen}
69*1c12ee1eSDan Willemsen
// GoSanitized converts a string to a valid Go identifier.
//
// Every rune that is neither a Unicode letter (unicode.IsLetter) nor a
// Unicode decimal digit (unicode.IsDigit) is replaced by '_'. The result is
// then prefixed with '_' if it is a Go keyword or if its first rune is not a
// Unicode letter; this includes the empty string and strings that already
// begin with '_'.
func GoSanitized(s string) string {
	replaceInvalid := func(r rune) rune {
		if !unicode.IsLetter(r) && !unicode.IsDigit(r) {
			return '_'
		}
		return r
	}
	ident := strings.Map(replaceInvalid, s)

	// An empty string decodes to utf8.RuneError, which is not a letter, so
	// it takes the prefix branch as well.
	first, _ := utf8.DecodeRuneInString(ident)
	needsPrefix := !unicode.IsLetter(first) || token.Lookup(ident).IsKeyword()
	if needsPrefix {
		return "_" + ident
	}
	return ident
}
89*1c12ee1eSDan Willemsen
90*1c12ee1eSDan Willemsen// JSONCamelCase converts a snake_case identifier to a camelCase identifier,
91*1c12ee1eSDan Willemsen// according to the protobuf JSON specification.
92*1c12ee1eSDan Willemsenfunc JSONCamelCase(s string) string {
93*1c12ee1eSDan Willemsen	var b []byte
94*1c12ee1eSDan Willemsen	var wasUnderscore bool
95*1c12ee1eSDan Willemsen	for i := 0; i < len(s); i++ { // proto identifiers are always ASCII
96*1c12ee1eSDan Willemsen		c := s[i]
97*1c12ee1eSDan Willemsen		if c != '_' {
98*1c12ee1eSDan Willemsen			if wasUnderscore && isASCIILower(c) {
99*1c12ee1eSDan Willemsen				c -= 'a' - 'A' // convert to uppercase
100*1c12ee1eSDan Willemsen			}
101*1c12ee1eSDan Willemsen			b = append(b, c)
102*1c12ee1eSDan Willemsen		}
103*1c12ee1eSDan Willemsen		wasUnderscore = c == '_'
104*1c12ee1eSDan Willemsen	}
105*1c12ee1eSDan Willemsen	return string(b)
106*1c12ee1eSDan Willemsen}
107*1c12ee1eSDan Willemsen
108*1c12ee1eSDan Willemsen// JSONSnakeCase converts a camelCase identifier to a snake_case identifier,
109*1c12ee1eSDan Willemsen// according to the protobuf JSON specification.
110*1c12ee1eSDan Willemsenfunc JSONSnakeCase(s string) string {
111*1c12ee1eSDan Willemsen	var b []byte
112*1c12ee1eSDan Willemsen	for i := 0; i < len(s); i++ { // proto identifiers are always ASCII
113*1c12ee1eSDan Willemsen		c := s[i]
114*1c12ee1eSDan Willemsen		if isASCIIUpper(c) {
115*1c12ee1eSDan Willemsen			b = append(b, '_')
116*1c12ee1eSDan Willemsen			c += 'a' - 'A' // convert to lowercase
117*1c12ee1eSDan Willemsen		}
118*1c12ee1eSDan Willemsen		b = append(b, c)
119*1c12ee1eSDan Willemsen	}
120*1c12ee1eSDan Willemsen	return string(b)
121*1c12ee1eSDan Willemsen}
122*1c12ee1eSDan Willemsen
123*1c12ee1eSDan Willemsen// MapEntryName derives the name of the map entry message given the field name.
124*1c12ee1eSDan Willemsen// See protoc v3.8.0: src/google/protobuf/descriptor.cc:254-276,6057
125*1c12ee1eSDan Willemsenfunc MapEntryName(s string) string {
126*1c12ee1eSDan Willemsen	var b []byte
127*1c12ee1eSDan Willemsen	upperNext := true
128*1c12ee1eSDan Willemsen	for _, c := range s {
129*1c12ee1eSDan Willemsen		switch {
130*1c12ee1eSDan Willemsen		case c == '_':
131*1c12ee1eSDan Willemsen			upperNext = true
132*1c12ee1eSDan Willemsen		case upperNext:
133*1c12ee1eSDan Willemsen			b = append(b, byte(unicode.ToUpper(c)))
134*1c12ee1eSDan Willemsen			upperNext = false
135*1c12ee1eSDan Willemsen		default:
136*1c12ee1eSDan Willemsen			b = append(b, byte(c))
137*1c12ee1eSDan Willemsen		}
138*1c12ee1eSDan Willemsen	}
139*1c12ee1eSDan Willemsen	b = append(b, "Entry"...)
140*1c12ee1eSDan Willemsen	return string(b)
141*1c12ee1eSDan Willemsen}
142*1c12ee1eSDan Willemsen
143*1c12ee1eSDan Willemsen// EnumValueName derives the camel-cased enum value name.
144*1c12ee1eSDan Willemsen// See protoc v3.8.0: src/google/protobuf/descriptor.cc:297-313
145*1c12ee1eSDan Willemsenfunc EnumValueName(s string) string {
146*1c12ee1eSDan Willemsen	var b []byte
147*1c12ee1eSDan Willemsen	upperNext := true
148*1c12ee1eSDan Willemsen	for _, c := range s {
149*1c12ee1eSDan Willemsen		switch {
150*1c12ee1eSDan Willemsen		case c == '_':
151*1c12ee1eSDan Willemsen			upperNext = true
152*1c12ee1eSDan Willemsen		case upperNext:
153*1c12ee1eSDan Willemsen			b = append(b, byte(unicode.ToUpper(c)))
154*1c12ee1eSDan Willemsen			upperNext = false
155*1c12ee1eSDan Willemsen		default:
156*1c12ee1eSDan Willemsen			b = append(b, byte(unicode.ToLower(c)))
157*1c12ee1eSDan Willemsen			upperNext = false
158*1c12ee1eSDan Willemsen		}
159*1c12ee1eSDan Willemsen	}
160*1c12ee1eSDan Willemsen	return string(b)
161*1c12ee1eSDan Willemsen}
162*1c12ee1eSDan Willemsen
163*1c12ee1eSDan Willemsen// TrimEnumPrefix trims the enum name prefix from an enum value name,
164*1c12ee1eSDan Willemsen// where the prefix is all lowercase without underscores.
165*1c12ee1eSDan Willemsen// See protoc v3.8.0: src/google/protobuf/descriptor.cc:330-375
166*1c12ee1eSDan Willemsenfunc TrimEnumPrefix(s, prefix string) string {
167*1c12ee1eSDan Willemsen	s0 := s // original input
168*1c12ee1eSDan Willemsen	for len(s) > 0 && len(prefix) > 0 {
169*1c12ee1eSDan Willemsen		if s[0] == '_' {
170*1c12ee1eSDan Willemsen			s = s[1:]
171*1c12ee1eSDan Willemsen			continue
172*1c12ee1eSDan Willemsen		}
173*1c12ee1eSDan Willemsen		if unicode.ToLower(rune(s[0])) != rune(prefix[0]) {
174*1c12ee1eSDan Willemsen			return s0 // no prefix match
175*1c12ee1eSDan Willemsen		}
176*1c12ee1eSDan Willemsen		s, prefix = s[1:], prefix[1:]
177*1c12ee1eSDan Willemsen	}
178*1c12ee1eSDan Willemsen	if len(prefix) > 0 {
179*1c12ee1eSDan Willemsen		return s0 // no prefix match
180*1c12ee1eSDan Willemsen	}
181*1c12ee1eSDan Willemsen	s = strings.TrimLeft(s, "_")
182*1c12ee1eSDan Willemsen	if len(s) == 0 {
183*1c12ee1eSDan Willemsen		return s0 // avoid returning empty string
184*1c12ee1eSDan Willemsen	}
185*1c12ee1eSDan Willemsen	return s
186*1c12ee1eSDan Willemsen}
187*1c12ee1eSDan Willemsen
// isASCIILower reports whether c is an ASCII lowercase letter, 'a' through 'z'.
func isASCIILower(c byte) bool {
	// Byte subtraction wraps around, so anything below 'a' becomes a large
	// value and fails the single comparison.
	return c-'a' <= 'z'-'a'
}
// isASCIIUpper reports whether c is an ASCII uppercase letter, 'A' through 'Z'.
func isASCIIUpper(c byte) bool {
	// Byte subtraction wraps around, so anything below 'A' becomes a large
	// value and fails the single comparison.
	return c-'A' <= 'Z'-'A'
}
// isASCIIDigit reports whether c is an ASCII decimal digit, '0' through '9'.
func isASCIIDigit(c byte) bool {
	// Byte subtraction wraps around, so anything below '0' becomes a large
	// value and fails the single comparison.
	return c-'0' <= '9'-'0'
}
197