readalignment.proto 9.0 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220
  // Copyright 2016 Google Inc.
  //
  // Licensed under the Apache License, Version 2.0 (the "License");
  // you may not use this file except in compliance with the License.
  // You may obtain a copy of the License at
  //
  //     http://www.apache.org/licenses/LICENSE-2.0
  //
  // Unless required by applicable law or agreed to in writing, software
  // distributed under the License is distributed on an "AS IS" BASIS,
  // WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
  // See the License for the specific language governing permissions and
  // limitations under the License.
  syntax = "proto3";

  package google.genomics.v1;

  // NOTE(review): nothing in the definitions visible in this file references
  // google.api annotations; the import is kept as-is because other tooling
  // may depend on it -- confirm before removing.
  import "google/api/annotations.proto";
  import "google/genomics/v1/cigar.proto";
  import "google/genomics/v1/position.proto";
  // Provides google.protobuf.ListValue, used as the value type of `Read.info`.
  import "google/protobuf/struct.proto";

  // Code-generation options. These are part of the published API surface
  // (generated package and class names), so their values must not change.
  option cc_enable_arenas = true;
  option go_package = "google.golang.org/genproto/googleapis/genomics/v1;genomics";
  option java_multiple_files = true;
  option java_outer_classname = "ReadAlignmentProto";
  option java_package = "com.google.genomics.v1";

  // A linear alignment can be represented by one CIGAR string. Describes the
  // mapped position and local alignment of the read to the reference.
  message LinearAlignment {
    // The position of this alignment.
    Position position = 1;

    // The mapping quality of this alignment. Represents how likely
    // the read maps to this position as opposed to other locations.
    //
    // Specifically, this is `-10 * log10(Pr(mapping position is wrong))`,
    // rounded to the nearest integer.
    int32 mapping_quality = 2;

    // Represents the local alignment of this sequence (alignment matches,
    // indels, etc) against the reference.
    repeated CigarUnit cigar = 3;
  }

  // A read alignment describes a linear alignment of a string of DNA to a
  // [reference sequence][google.genomics.v1.Reference], in addition to
  // metadata about the fragment (the molecule of DNA sequenced) and the read
  // (the bases which were read by the sequencer). A read is equivalent to a
  // line in a SAM file. A read belongs to exactly one read group and exactly
  // one [read group set][google.genomics.v1.ReadGroupSet].
  //
  // For more genomics resource definitions, see [Fundamentals of Google
  // Genomics](https://cloud.google.com/genomics/fundamentals-of-google-genomics).
  //
  // ### Reverse-stranded reads
  //
  // Mapped reads (reads having a non-null `alignment`) can be aligned to
  // either the forward or the reverse strand of their associated reference.
  // Strandedness of a mapped read is encoded by
  // `alignment.position.reverseStrand`.
  //
  // If we consider the reference to be a forward-stranded coordinate space of
  // `[0, reference.length)` with `0` as the left-most position and
  // `reference.length` as the right-most position, reads are always aligned
  // left to right. That is, `alignment.position.position` always refers to
  // the left-most reference coordinate and `alignment.cigar` describes the
  // alignment of this read to the reference from left to right. All per-base
  // fields such as `alignedSequence` and `alignedQuality` share this same
  // left-to-right orientation; this is true of reads which are aligned to
  // either strand. For reverse-stranded reads, this means that
  // `alignedSequence` is the reverse complement of the bases that were
  // originally reported by the sequencing machine.
  //
  // ### Generating a reference-aligned sequence string
  //
  // When interacting with mapped reads, it's often useful to produce a string
  // representing the local alignment of the read to reference. The following
  // pseudocode demonstrates one way of doing this:
  //
  //     out = ""
  //     offset = 0
  //     for c in read.alignment.cigar {
  //       switch c.operation {
  //       case "ALIGNMENT_MATCH", "SEQUENCE_MATCH", "SEQUENCE_MISMATCH":
  //         out += read.alignedSequence[offset:offset+c.operationLength]
  //         offset += c.operationLength
  //         break
  //       case "CLIP_SOFT", "INSERT":
  //         offset += c.operationLength
  //         break
  //       case "PAD":
  //         out += repeat("*", c.operationLength)
  //         break
  //       case "DELETE":
  //         out += repeat("-", c.operationLength)
  //         break
  //       case "SKIP":
  //         out += repeat(" ", c.operationLength)
  //         break
  //       case "CLIP_HARD":
  //         break
  //       }
  //     }
  //     return out
  //
  // ### Converting to SAM's CIGAR string
  //
  // The following pseudocode generates a SAM CIGAR string from the
  // `cigar` field. Note that this is a lossy conversion
  // (`cigar.referenceSequence` is lost).
  //
  //     cigarMap = {
  //       "ALIGNMENT_MATCH": "M",
  //       "INSERT": "I",
  //       "DELETE": "D",
  //       "SKIP": "N",
  //       "CLIP_SOFT": "S",
  //       "CLIP_HARD": "H",
  //       "PAD": "P",
  //       "SEQUENCE_MATCH": "=",
  //       "SEQUENCE_MISMATCH": "X",
  //     }
  //     cigarStr = ""
  //     for c in read.alignment.cigar {
  //       cigarStr += c.operationLength + cigarMap[c.operation]
  //     }
  //     return cigarStr
  message Read {
    // The server-generated read ID, unique across all reads. This is
    // different from the `fragmentName`.
    string id = 1;

    // The ID of the read group this read belongs to. A read belongs to
    // exactly one read group. This is a server-generated ID which is distinct
    // from SAM's RG tag (for that value, see
    // [ReadGroup.name][google.genomics.v1.ReadGroup.name]).
    string read_group_id = 2;

    // The ID of the read group set this read belongs to. A read belongs to
    // exactly one read group set.
    string read_group_set_id = 3;

    // The fragment name. Equivalent to QNAME (query template name) in SAM.
    string fragment_name = 4;

    // The orientation and the distance between reads from the fragment are
    // consistent with the sequencing protocol (SAM flag 0x2).
    bool proper_placement = 5;

    // The fragment is a PCR or optical duplicate (SAM flag 0x400).
    bool duplicate_fragment = 6;

    // The observed length of the fragment, equivalent to TLEN in SAM.
    int32 fragment_length = 7;

    // The read number in sequencing. 0-based and less than numberReads. This
    // field replaces SAM flag 0x40 and 0x80.
    int32 read_number = 8;

    // The number of reads in the fragment (extension to SAM flag 0x1).
    int32 number_reads = 9;

    // Whether this read did not pass filters, such as platform or vendor
    // quality controls (SAM flag 0x200).
    bool failed_vendor_quality_checks = 10;

    // The linear alignment for this alignment record. This field is null for
    // unmapped reads.
    LinearAlignment alignment = 11;

    // Whether this alignment is secondary. Equivalent to SAM flag 0x100.
    // A secondary alignment represents an alternative to the primary
    // alignment for this read. Aligners may return secondary alignments if a
    // read can map ambiguously to multiple coordinates in the genome. By
    // convention, each read has one and only one alignment where both
    // `secondaryAlignment` and `supplementaryAlignment` are false.
    bool secondary_alignment = 12;

    // Whether this alignment is supplementary. Equivalent to SAM flag 0x800.
    // Supplementary alignments are used in the representation of a chimeric
    // alignment. In a chimeric alignment, a read is split into multiple
    // linear alignments that map to different reference contigs. The first
    // linear alignment in the read will be designated as the representative
    // alignment; the remaining linear alignments will be designated as
    // supplementary alignments. These alignments may have different mapping
    // quality scores. In each linear alignment in a chimeric alignment, the
    // read will be hard clipped. The `alignedSequence` and `alignedQuality`
    // fields in the alignment record will only represent the bases for its
    // respective linear alignment.
    bool supplementary_alignment = 13;

    // The bases of the read sequence contained in this alignment record,
    // **without CIGAR operations applied** (equivalent to SEQ in SAM).
    // `alignedSequence` and `alignedQuality` may be shorter than the full
    // read sequence and quality. This will occur if the alignment is part of
    // a chimeric alignment, or if the read was trimmed. When this occurs, the
    // CIGAR for this read will begin/end with a hard clip operator that will
    // indicate the length of the excised sequence.
    string aligned_sequence = 14;

    // The quality of the read sequence contained in this alignment record
    // (equivalent to QUAL in SAM).
    // `alignedSequence` and `alignedQuality` may be shorter than the full
    // read sequence and quality. This will occur if the alignment is part of
    // a chimeric alignment, or if the read was trimmed. When this occurs, the
    // CIGAR for this read will begin/end with a hard clip operator that will
    // indicate the length of the excised sequence.
    repeated int32 aligned_quality = 15;

    // The mapping of the primary alignment of the
    // `(readNumber+1)%numberReads` read in the fragment. It replaces
    // mate position and mate strand in SAM.
    Position next_mate_position = 16;

    // A map of additional read alignment information. This must be of the
    // form map<string, string[]> (string key mapping to a list of string
    // values).
    map<string, google.protobuf.ListValue> info = 17;
  }