// xref: /aosp_15_r20/external/googleapis/google/genomics/v1/reads.proto (revision d5c09012810ac0c9f33fe448fb6da8260d444cc9)
//
// Copyright 2016 Google Inc.
//
// Licensed under the Apache License, Version 2.0 (the "License");
// you may not use this file except in compliance with the License.
// You may obtain a copy of the License at
//
//     http://www.apache.org/licenses/LICENSE-2.0
//
// Unless required by applicable law or agreed to in writing, software
// distributed under the License is distributed on an "AS IS" BASIS,
// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
// See the License for the specific language governing permissions and
// limitations under the License.

syntax = "proto3";

package google.genomics.v1;

import "google/api/annotations.proto";
import "google/genomics/v1/range.proto";
import "google/genomics/v1/readalignment.proto";
import "google/genomics/v1/readgroupset.proto";
import "google/longrunning/operations.proto";
import "google/protobuf/empty.proto";
import "google/protobuf/field_mask.proto";

// Code-generation options for the C++, Go, and Java outputs.
option cc_enable_arenas = true;
option go_package = "google.golang.org/genproto/googleapis/genomics/v1;genomics";
option java_multiple_files = true;
option java_outer_classname = "ReadsProto";
option java_package = "com.google.genomics.v1";

// Service for streaming the reads that match a search request.
service StreamingReadService {
  // Returns a stream of all the reads matching the search request, ordered
  // by reference name, position, and ID.
  rpc StreamReads(StreamReadsRequest) returns (stream StreamReadsResponse) {
    option (google.api.http) = {
      post: "/v1/reads:stream"
      body: "*"
    };
  }
}

// The Readstore. A data store for DNA sequencing Reads.
service ReadServiceV1 {
  // Creates read group sets by asynchronously importing the provided
  // information.
  //
  // For the definitions of read group sets and other genomics resources, see
  // [Fundamentals of Google
  // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics)
  //
  // The caller must have WRITE permissions to the dataset.
  //
  // ## Notes on [BAM](https://samtools.github.io/hts-specs/SAMv1.pdf) import
  //
  // - Tags will be converted to strings - tag types are not preserved
  // - Comments (`@CO`) in the input file header will not be preserved
  // - Original header order of references (`@SQ`) will not be preserved
  // - Any reverse stranded unmapped reads will be reverse complemented, and
  //   their qualities (also the "BQ" and "OQ" tags, if any) will be reversed
  // - Unmapped reads will be stripped of positional information (reference
  //   name and position)
  rpc ImportReadGroupSets(ImportReadGroupSetsRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1/readgroupsets:import"
      body: "*"
    };
  }

  // Exports a read group set to a BAM file in Google Cloud Storage.
  //
  // For the definitions of read group sets and other genomics resources, see
  // [Fundamentals of Google
  // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics)
  //
  // Note that currently there may be some differences between exported BAM
  // files and the original BAM file at the time of import. See
  // [ImportReadGroupSets][google.genomics.v1.ReadServiceV1.ImportReadGroupSets]
  // for caveats.
  rpc ExportReadGroupSet(ExportReadGroupSetRequest)
      returns (google.longrunning.Operation) {
    option (google.api.http) = {
      post: "/v1/readgroupsets/{read_group_set_id}:export"
      body: "*"
    };
  }

  // Searches for read group sets matching the criteria.
  //
  // For the definitions of read group sets and other genomics resources, see
  // [Fundamentals of Google
  // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics)
  //
  // Implements
  // [GlobalAllianceApi.searchReadGroupSets](https://github.com/ga4gh/schemas/blob/v0.5.1/src/main/resources/avro/readmethods.avdl#L135).
  rpc SearchReadGroupSets(SearchReadGroupSetsRequest)
      returns (SearchReadGroupSetsResponse) {
    option (google.api.http) = {
      post: "/v1/readgroupsets/search"
      body: "*"
    };
  }

  // Updates a read group set.
  //
  // For the definitions of read group sets and other genomics resources, see
  // [Fundamentals of Google
  // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics)
  //
  // This method supports patch semantics.
  rpc UpdateReadGroupSet(UpdateReadGroupSetRequest) returns (ReadGroupSet) {
    option (google.api.http) = {
      patch: "/v1/readgroupsets/{read_group_set_id}"
      body: "read_group_set"
    };
  }

  // Deletes a read group set.
  //
  // For the definitions of read group sets and other genomics resources, see
  // [Fundamentals of Google
  // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics)
  rpc DeleteReadGroupSet(DeleteReadGroupSetRequest)
      returns (google.protobuf.Empty) {
    option (google.api.http) = {
      delete: "/v1/readgroupsets/{read_group_set_id}"
    };
  }

  // Gets a read group set by ID.
  //
  // For the definitions of read group sets and other genomics resources, see
  // [Fundamentals of Google
  // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics)
  rpc GetReadGroupSet(GetReadGroupSetRequest) returns (ReadGroupSet) {
    option (google.api.http) = {
      get: "/v1/readgroupsets/{read_group_set_id}"
    };
  }

  // Lists fixed width coverage buckets for a read group set, each of which
  // corresponds to a range of a reference sequence. Each bucket summarizes
  // coverage information across its corresponding genomic range.
  //
  // For the definitions of read group sets and other genomics resources, see
  // [Fundamentals of Google
  // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics)
  //
  // Coverage is defined as the number of reads which are aligned to a given
  // base in the reference sequence. Coverage buckets are available at several
  // precomputed bucket widths, enabling retrieval of various coverage 'zoom
  // levels'. The caller must have READ permissions for the target read group
  // set.
  rpc ListCoverageBuckets(ListCoverageBucketsRequest)
      returns (ListCoverageBucketsResponse) {
    option (google.api.http) = {
      get: "/v1/readgroupsets/{read_group_set_id}/coveragebuckets"
    };
  }

  // Gets a list of reads for one or more read group sets.
  //
  // For the definitions of read group sets and other genomics resources, see
  // [Fundamentals of Google
  // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics)
  //
  // Reads search operates over a genomic coordinate space of reference sequence
  // & position defined over the reference sequences to which the requested
  // read group sets are aligned.
  //
  // If a target positional range is specified, search returns all reads whose
  // alignment to the reference genome overlaps the range. A query which
  // specifies only read group set IDs yields all reads in those read group
  // sets, including unmapped reads.
  //
  // All reads returned (including reads on subsequent pages) are ordered by
  // genomic coordinate (by reference sequence, then position). Reads with
  // equivalent genomic coordinates are returned in an unspecified order. This
  // order is consistent, such that two queries for the same content (regardless
  // of page size) yield reads in the same order across their respective streams
  // of paginated responses.
  //
  // Implements
  // [GlobalAllianceApi.searchReads](https://github.com/ga4gh/schemas/blob/v0.5.1/src/main/resources/avro/readmethods.avdl#L85).
  rpc SearchReads(SearchReadsRequest) returns (SearchReadsResponse) {
    option (google.api.http) = {
      post: "/v1/reads/search"
      body: "*"
    };
  }
}

// The read group set search request.
message SearchReadGroupSetsRequest {
  // Restricts this query to read group sets within the given datasets. At least
  // one ID must be provided.
  repeated string dataset_ids = 1;

  // Only return read group sets for which a substring of the name matches this
  // string.
  string name = 3;

  // The continuation token, which is used to page through large result sets.
  // To get the next page of results, set this parameter to the value of
  // `nextPageToken` from the previous response.
  string page_token = 2;

  // The maximum number of results to return in a single page. If unspecified,
  // defaults to 256. The maximum value is 1024.
  int32 page_size = 4;
}

// The read group set search response.
message SearchReadGroupSetsResponse {
  // The list of matching read group sets.
  repeated ReadGroupSet read_group_sets = 1;

  // The continuation token, which is used to page through large result sets.
  // Provide this value in a subsequent request to return the next page of
  // results. This field will be empty if there aren't any additional results.
  string next_page_token = 2;
}

// The read group set import request.
message ImportReadGroupSetsRequest {
  // Strategies for partitioning imported read groups into read group sets.
  enum PartitionStrategy {
    // No strategy was specified. `PER_FILE_PER_SAMPLE` is documented as the
    // default behavior.
    PARTITION_STRATEGY_UNSPECIFIED = 0;

    // In most cases, this strategy yields one read group set per file. This is
    // the default behavior.
    //
    // Allocate one read group set per file per sample. For BAM files, read
    // groups are considered to share a sample if they have identical sample
    // names. Furthermore, all reads for each file which do not belong to a read
    // group, if any, will be grouped into a single read group set per-file.
    PER_FILE_PER_SAMPLE = 1;

    // Includes all read groups in all imported files into a single read group
    // set. Requires that the headers for all imported files are equivalent. All
    // reads which do not belong to a read group, if any, will be grouped into a
    // separate read group set.
    MERGE_ALL = 2;
  }

  // Required. The ID of the dataset these read group sets will belong to. The
  // caller must have WRITE permissions to this dataset.
  string dataset_id = 1;

  // The reference set to which the imported read group sets are aligned, if
  // any. The reference names of this reference set must be a superset of those
  // found in the imported file headers. If no reference set id is provided, a
  // best effort is made to associate with a matching reference set.
  string reference_set_id = 4;

  // A list of URIs pointing at [BAM
  // files](https://samtools.github.io/hts-specs/SAMv1.pdf)
  // in Google Cloud Storage.
  // Those URIs can include wildcards (*), but do not add or remove
  // matching files before import has completed.
  //
  // Note that Google Cloud Storage object listing is only eventually
  // consistent: files added may not be immediately visible to
  // everyone. Thus, if using a wildcard it is preferable not to start
  // the import immediately after the files are created.
  repeated string source_uris = 2;

  // The partition strategy describes how read groups are partitioned into read
  // group sets.
  PartitionStrategy partition_strategy = 5;
}

// The read group set import response.
message ImportReadGroupSetsResponse {
  // IDs of the read group sets that were created.
  repeated string read_group_set_ids = 1;
}

// The read group set export request.
message ExportReadGroupSetRequest {
  // Required. The Google Cloud project ID that owns this
  // export. The caller must have WRITE access to this project.
  string project_id = 1;

  // Required. A Google Cloud Storage URI for the exported BAM file.
  // The currently authenticated user must have write access to the new file.
  // An error will be returned if the URI already contains data.
  string export_uri = 2;

  // Required. The ID of the read group set to export. The caller must have
  // READ access to this read group set.
  string read_group_set_id = 3;

  // The reference names to export. If this is not specified, all reference
  // sequences, including unmapped reads, are exported.
  // Use `*` to export only unmapped reads.
  repeated string reference_names = 4;
}

// The read group set update request.
message UpdateReadGroupSetRequest {
  // The ID of the read group set to be updated. The caller must have WRITE
  // permissions to the dataset associated with this read group set.
  string read_group_set_id = 1;

  // The new read group set data. See `updateMask` for details on mutability of
  // fields.
  ReadGroupSet read_group_set = 2;

  // An optional mask specifying which fields to update. Supported fields:
  //
  // * [name][google.genomics.v1.ReadGroupSet.name].
  // * [referenceSetId][google.genomics.v1.ReadGroupSet.reference_set_id].
  //
  // Leaving `updateMask` unset is equivalent to specifying all mutable
  // fields.
  google.protobuf.FieldMask update_mask = 3;
}

// The read group set delete request.
message DeleteReadGroupSetRequest {
  // The ID of the read group set to be deleted. The caller must have WRITE
  // permissions to the dataset associated with this read group set.
  string read_group_set_id = 1;
}

// The read group set get request.
message GetReadGroupSetRequest {
  // The ID of the read group set.
  string read_group_set_id = 1;
}

// The coverage bucket list request.
message ListCoverageBucketsRequest {
  // Required. The ID of the read group set over which coverage is requested.
  string read_group_set_id = 1;

  // The name of the reference to query, within the reference set associated
  // with this query. Optional.
  string reference_name = 3;

  // The start position of the range on the reference, 0-based inclusive. If
  // specified, `referenceName` must also be specified. Defaults to 0.
  int64 start = 4;

  // The end position of the range on the reference, 0-based exclusive. If
  // specified, `referenceName` must also be specified. If unset or 0, defaults
  // to the length of the reference.
  int64 end = 5;

  // The desired width of each reported coverage bucket in base pairs. This
  // will be rounded down to the nearest precomputed bucket width; the value
  // of which is returned as `bucketWidth` in the response. Defaults
  // to infinity (each bucket spans an entire reference sequence) or the length
  // of the target range, if specified. The smallest precomputed
  // `bucketWidth` is currently 2048 base pairs; this is subject to
  // change.
  int64 target_bucket_width = 6;

  // The continuation token, which is used to page through large result sets.
  // To get the next page of results, set this parameter to the value of
  // `nextPageToken` from the previous response.
  string page_token = 7;

  // The maximum number of results to return in a single page. If unspecified,
  // defaults to 1024. The maximum value is 2048.
  int32 page_size = 8;
}

// A bucket over which read coverage has been precomputed. A bucket corresponds
// to a specific range of the reference sequence.
message CoverageBucket {
  // The genomic coordinate range spanned by this bucket.
  Range range = 1;

  // The average number of reads which are aligned to each individual
  // reference base in this bucket.
  float mean_coverage = 2;
}

// The coverage bucket list response.
message ListCoverageBucketsResponse {
  // The length of each coverage bucket in base pairs. Note that buckets at the
  // end of a reference sequence may be shorter. This value is omitted if the
  // bucket width is infinity (the default behaviour, with no range or
  // `targetBucketWidth`).
  int64 bucket_width = 1;

  // The coverage buckets. The list of buckets is sparse; a bucket with 0
  // overlapping reads is not returned. A bucket never crosses more than one
  // reference sequence. Each bucket has width `bucketWidth`, unless
  // its end is the end of the reference sequence.
  repeated CoverageBucket coverage_buckets = 2;

  // The continuation token, which is used to page through large result sets.
  // Provide this value in a subsequent request to return the next page of
  // results. This field will be empty if there aren't any additional results.
  string next_page_token = 3;
}

// The read search request.
message SearchReadsRequest {
  // The IDs of the read group sets within which to search for reads. All
  // specified read group sets must be aligned against a common set of reference
  // sequences; this defines the genomic coordinates for the query. Must specify
  // one of `readGroupSetIds` or `readGroupIds`.
  repeated string read_group_set_ids = 1;

  // The IDs of the read groups within which to search for reads. All specified
  // read groups must belong to the same read group sets. Must specify one of
  // `readGroupSetIds` or `readGroupIds`.
  repeated string read_group_ids = 5;

  // The reference sequence name, for example `chr1`, `1`, or `chrX`. If set to
  // `*`, only unmapped reads are returned. If unspecified, all reads (mapped
  // and unmapped) are returned.
  string reference_name = 7;

  // The start position of the range on the reference, 0-based inclusive. If
  // specified, `referenceName` must also be specified.
  int64 start = 8;

  // The end position of the range on the reference, 0-based exclusive. If
  // specified, `referenceName` must also be specified.
  int64 end = 9;

  // The continuation token, which is used to page through large result sets.
  // To get the next page of results, set this parameter to the value of
  // `nextPageToken` from the previous response.
  string page_token = 3;

  // The maximum number of results to return in a single page. If unspecified,
  // defaults to 256. The maximum value is 2048.
  int32 page_size = 4;
}

// The read search response.
message SearchReadsResponse {
  // The list of matching alignments sorted by mapped genomic coordinate,
  // if any, ascending in position within the same reference. Unmapped reads,
  // which have no position, are returned contiguously and are sorted in
  // ascending lexicographic order by fragment name.
  repeated Read alignments = 1;

  // The continuation token, which is used to page through large result sets.
  // Provide this value in a subsequent request to return the next page of
  // results. This field will be empty if there aren't any additional results.
  string next_page_token = 2;
}

// The stream reads request.
message StreamReadsRequest {
  // The Google Cloud project ID which will be billed
  // for this access. The caller must have WRITE access to this project.
  // Required.
  string project_id = 1;

  // The ID of the read group set from which to stream reads.
  string read_group_set_id = 2;

  // The reference sequence name, for example `chr1`,
  // `1`, or `chrX`. If set to `*`, only unmapped reads are
  // returned.
  string reference_name = 3;

  // The start position of the range on the reference, 0-based inclusive. If
  // specified, `referenceName` must also be specified.
  int64 start = 4;

  // The end position of the range on the reference, 0-based exclusive. If
  // specified, `referenceName` must also be specified.
  int64 end = 5;

  // Restricts results to a shard containing approximately `1/totalShards`
  // of the normal response payload for this query. Results from a sharded
  // request are disjoint from those returned by all queries which differ only
  // in their shard parameter. A shard may yield 0 results; this is especially
  // likely for large values of `totalShards`.
  //
  // Valid values are `[0, totalShards)`.
  int32 shard = 6;

  // Specifying `totalShards` causes a disjoint subset of the normal response
  // payload to be returned for each query with a unique `shard` parameter
  // specified. A best effort is made to yield equally sized shards. Sharding
  // can be used to distribute processing amongst workers, where each worker is
  // assigned a unique `shard` number and all workers specify the same
  // `totalShards` number. The union of reads returned for all sharded queries
  // `[0, totalShards)` is equal to those returned by a single unsharded query.
  //
  // Queries for different values of `totalShards` with common divisors will
  // share shard boundaries. For example, streaming `shard` 2 of 5
  // `totalShards` yields the same results as streaming `shard`s 4 and 5 of 10
  // `totalShards`. This property can be leveraged for adaptive retries.
  int32 total_shards = 7;
}

// The stream reads response. `StreamReads` returns a stream of these messages.
message StreamReadsResponse {
  // The reads carried by this message of the response stream.
  repeated Read alignments = 1;
}
