xref: /aosp_15_r20/external/licenseclassifier/commentparser/language/language.go (revision 46c4c49da23cae783fa41bf46525a6505638499a)
1// Copyright 2017 Google Inc.
2//
3// Licensed under the Apache License, Version 2.0 (the "License");
4// you may not use this file except in compliance with the License.
5// You may obtain a copy of the License at
6//
7//     http://www.apache.org/licenses/LICENSE-2.0
8//
9// Unless required by applicable law or agreed to in writing, software
10// distributed under the License is distributed on an "AS IS" BASIS,
11// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
12// See the License for the specific language governing permissions and
13// limitations under the License.
14
15// Package language contains methods and information about the different
16// programming languages the comment parser supports.
17package language
18
19import (
20	"path/filepath"
21	"strings"
22)
23
// Language is the programming language we're grabbing the comments from.
type Language int
26
// Languages we can retrieve comments from. The values are assigned with iota,
// so they follow declaration order and Unknown is the zero value.
const (
	Unknown Language = iota
	AppleScript
	Assembly
	BLIF // Berkeley Logic Interchange Format
	Batch
	C
	Clif
	Clojure
	CMake
	CSharp
	Dart
	EDIF // Electronic Design Interchange Format
	Elixir
	Flex
	Fortran
	GLSLF // OpenGL Shading Language
	Go
	HTML
	Haskell
	Java
	JavaScript
	Kotlin
	LEF // Library Exchange Format
	Lisp
	Markdown
	Matlab
	MySQL
	NinjaBuild
	ObjectiveC
	Perl
	Python
	R
	Ruby
	Rust
	SDC  // Synopsys Design Constraints
	SDF  // Standard Delay Format
	SPEF // Standard Parasitics Exchange Format
	SQL
	SWIG
	Shader
	Shell
	Swift
	SystemVerilog
	TCL
	TypeScript
	Verilog
	XDC // Xilinx Design Constraint files
	Yacc
	Yaml
)
79
// style identifies the comment syntax that a language uses.
type style int
82
// Comment styles. The trailing comment on each constant shows the comment
// syntax of that style: the single line form first, then the multiline form
// if the style has one.
const (
	unknown     style = iota
	applescript       // -- ... and (* ... *)
	batch             // @REM
	bcpl              // // ... and /* ... */
	cmake             // # ... and #[[ ... ]]
	fortran           // ! ...
	hash              // # ...
	haskell           // -- ... and {- ... -}
	html              // <!-- ... -->
	lisp              // ; ...
	matlab            // % ... and %{ ... %}
	mysql             // # ... and /* ... */
	ruby              // # ... and =begin ... =end
	shell             // # ...
	sql               // -- ... and /* ... */
)
101
102// ClassifyLanguage determines what language the source code was written in. It
103// does this by looking at the file's extension.
104func ClassifyLanguage(filename string) Language {
105	ext := strings.ToLower(filepath.Ext(filename))
106	if len(ext) == 0 || ext[0] != '.' {
107		return Unknown
108	}
109
110	switch ext[1:] { // Skip the '.'.
111	case "applescript":
112		return AppleScript
113	case "bat":
114		return Batch
115	case "blif", "eblif":
116		return BLIF
117	case "c", "cc", "cpp", "c++", "h", "hh", "hpp":
118		return C
119	case "clif":
120		return Clif
121	case "cmake":
122		return CMake
123	case "cs":
124		return CSharp
125	case "dart":
126		return Dart
127	case "ex", "exs":
128		return Elixir
129	case "f", "f90", "f95":
130		return Fortran
131	case "glslf":
132		return GLSLF
133	case "go":
134		return Go
135	case "hs":
136		return Haskell
137	case "html", "htm", "ng", "sgml":
138		return HTML
139	case "java":
140		return Java
141	case "js":
142		return JavaScript
143	case "kt":
144		return Kotlin
145	case "l":
146		return Flex
147	case "lef":
148		return LEF
149	case "lisp", "el", "clj":
150		return Lisp
151	case "m", "mm":
152		return ObjectiveC
153	case "md":
154		return Markdown
155	case "gn":
156		return NinjaBuild
157	case "pl", "pm":
158		return Perl
159	case "py", "pi":
160		return Python
161	case "r":
162		return R
163	case "rb":
164		return Ruby
165	case "rs":
166		return Rust
167	case "s":
168		return Assembly
169	case "sdf":
170		return SDF
171	case "sh":
172		return Shell
173	case "shader":
174		return Shader
175	case "sql":
176		return SQL
177	case "swift":
178		return Swift
179	case "swig":
180		return SWIG
181	case "sv", "svh":
182		return SystemVerilog
183	case "tcl", "sdc", "xdc":
184		return TCL
185	case "ts", "tsx":
186		return TypeScript
187	case "v", "vh":
188		return Verilog
189	case "y":
190		return Yacc
191	case "yaml":
192		return Yaml
193	}
194	return Unknown
195}
196
197// commentStyle returns the language's comment style.
198func (lang Language) commentStyle() style {
199	switch lang {
200	case Assembly, C, CSharp, Dart, Flex, GLSLF, Go, Java, JavaScript, Kotlin, ObjectiveC, Rust, Shader, Swift, SWIG, TypeScript, Yacc, Verilog, SystemVerilog, SDF, SPEF:
201		return bcpl
202	case Batch:
203		return batch
204	case BLIF, TCL:
205		return hash
206	case CMake:
207		return cmake
208	case Fortran:
209		return fortran
210	case Haskell:
211		return haskell
212	case HTML, Markdown:
213		return html
214	case Clojure, Lisp:
215		return lisp
216	case Ruby:
217		return ruby
218	case Clif, Elixir, NinjaBuild, Perl, Python, R, Shell, Yaml:
219		return shell
220	case Matlab:
221		return matlab
222	case MySQL:
223		return mysql
224	case SQL:
225		return sql
226	}
227	return unknown
228}
229
230// SingleLineCommentStart returns the starting string of a single line comment
231// for the given language. There is no equivalent "End" method, because it's
232// the end of line.
233func (lang Language) SingleLineCommentStart() string {
234	switch lang.commentStyle() {
235	case applescript, haskell, sql:
236		return "--"
237	case batch:
238		return "@REM"
239	case bcpl:
240		return "//"
241	case fortran:
242		return "!"
243	case lisp:
244		return ";"
245	case matlab:
246		return "%"
247	case shell, ruby, cmake, mysql, hash:
248		return "#"
249	}
250	return ""
251}
252
253// MultilineCommentStart returns the starting string of a multiline comment for
254// the given language.
255func (lang Language) MultilineCommentStart() string {
256	switch lang.commentStyle() {
257	case applescript:
258		return "(*"
259	case bcpl, mysql:
260		if lang != Rust {
261			return "/*"
262		}
263	case cmake:
264		return "#[["
265	case haskell:
266		return "{-"
267	case html:
268		return "<!--"
269	case matlab:
270		return "%{"
271	case ruby:
272		return "=begin"
273	}
274	return ""
275}
276
277// MultilineCommentEnd returns the ending string of a multiline comment for the
278// given language.
279func (lang Language) MultilineCommentEnd() string {
280	switch lang.commentStyle() {
281	case applescript:
282		return "*)"
283	case bcpl, mysql:
284		if lang != Rust {
285			return "*/"
286		}
287	case cmake:
288		return "]]"
289	case haskell:
290		return "-}"
291	case html:
292		return "-->"
293	case matlab:
294		return "%}"
295	case ruby:
296		return "=end"
297	}
298	return ""
299}
300
301// QuoteCharacter returns 'true' if the character is considered the beginning
302// of a string in the given language. The second return value is true if the
303// string allows for escaping.
304func (lang Language) QuoteCharacter(quote rune) (ok bool, escape bool) {
305	switch quote {
306	case '"', '\'':
307		return true, true
308	case '`':
309		if lang == Go {
310			return true, false
311		}
312	}
313	return false, false
314}
315
316// NestedComments returns true if the language allows for nested multiline comments.
317func (lang Language) NestedComments() bool {
318	return lang == Swift
319}
320