<?xml version="1.0" encoding="utf-8"?>
<!--
 Copyright (c) 2012-2016 Xiph.Org Foundation and contributors

 Redistribution and use in source and binary forms, with or without
 modification, are permitted provided that the following conditions
 are met:

 - Redistributions of source code must retain the above copyright
 notice, this list of conditions and the following disclaimer.

 - Redistributions in binary form must reproduce the above copyright
 notice, this list of conditions and the following disclaimer in the
 documentation and/or other materials provided with the distribution.

 THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
 ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
 LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
 A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
 OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
 EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
 PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
 PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
 LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
 NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
 SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.

 Special permission is granted to remove the above copyright notice, list of
 conditions, and disclaimer when submitting this document, with or without
 modification, to the IETF.
-->
<!DOCTYPE rfc SYSTEM 'rfc2629.dtd' [
<!ENTITY rfc2119 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.2119.xml'>
<!ENTITY rfc3533 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.3533.xml'>
<!ENTITY rfc3629 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.3629.xml'>
<!ENTITY rfc4732 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.4732.xml'>
<!ENTITY rfc5226 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.5226.xml'>
<!ENTITY rfc5334 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.5334.xml'>
<!ENTITY rfc6381 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.6381.xml'>
<!ENTITY rfc6716 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.6716.xml'>
<!ENTITY rfc6982 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.6982.xml'>
<!ENTITY rfc7587 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.7587.xml'>
]>
<?rfc toc="yes" symrefs="yes" ?>

<rfc ipr="trust200902" category="std" docName="draft-ietf-codec-oggopus-14"
 updates="5334">

<front>
<title abbrev="Ogg Opus">Ogg Encapsulation for the Opus Audio Codec</title>
<author initials="T.B." surname="Terriberry" fullname="Timothy B.
Terriberry">
<organization>Mozilla Corporation</organization>
<address>
<postal>
<street>650 Castro Street</street>
<city>Mountain View</city>
<region>CA</region>
<code>94041</code>
<country>USA</country>
</postal>
<phone>+1 650 903-0800</phone>
<email>tterribe@xiph.org</email>
</address>
</author>

<author initials="R." surname="Lee" fullname="Ron Lee">
<organization>Voicetronix</organization>
<address>
<postal>
<street>246 Pulteney Street, Level 1</street>
<city>Adelaide</city>
<region>SA</region>
<code>5000</code>
<country>Australia</country>
</postal>
<phone>+61 8 8232 9112</phone>
<email>ron@debian.org</email>
</address>
</author>

<author initials="R."
surname="Giles" fullname="Ralph Giles">
<organization>Mozilla Corporation</organization>
<address>
<postal>
<street>163 West Hastings Street</street>
<city>Vancouver</city>
<region>BC</region>
<code>V6B 1H5</code>
<country>Canada</country>
</postal>
<phone>+1 778 785 1540</phone>
<email>giles@xiph.org</email>
</address>
</author>

<date day="22" month="February" year="2016"/>
<area>RAI</area>
<workgroup>codec</workgroup>

<abstract>
<t>
This document defines the Ogg encapsulation for the Opus interactive speech and
 audio codec.
This allows data encoded in the Opus format to be stored in an Ogg logical
 bitstream.
</t>
</abstract>
</front>

<middle>
<section anchor="intro" title="Introduction">
<t>
The IETF Opus codec is a low-latency audio codec optimized for both voice and
 general-purpose audio.
See <xref target="RFC6716"/> for technical details.
This document defines the encapsulation of Opus in a continuous, logical Ogg
 bitstream <xref target="RFC3533"/>.
Ogg encapsulation provides Opus with a long-term storage format supporting
 all of the essential features, including metadata, fast and accurate seeking,
 corruption detection, recapture after errors, low overhead, and the ability to
 multiplex Opus with other codecs (including video) with minimal buffering.
It also provides a live streamable format, capable of delivery over a reliable
 stream-oriented transport, without requiring all the data, or even the total
 length of the data, up-front, in a form that is identical to the on-disk
 storage format.
</t>
<t>
Ogg bitstreams are made up of a series of 'pages', each of which contains data
 from one or more 'packets'.
Pages are the fundamental unit of multiplexing in an Ogg stream.
Each page is associated with a particular logical stream and contains a capture
 pattern and checksum, flags to mark the beginning and end of the logical
 stream, and a 'granule position' that represents an absolute position in the
 stream, to aid seeking.
A single page can contain up to 65,025 octets of packet data from up to 255
 different packets.
Packets can be split arbitrarily across pages, and continued from one page to
 the next (allowing packets much larger than would fit on a single page).
Each page contains 'lacing values' that indicate how the data is partitioned
 into packets, allowing a demultiplexer (demuxer) to recover the packet
 boundaries without examining the encoded data.
A packet is said to 'complete' on a page when the page contains the final
 lacing value corresponding to that packet.
</t>
<t>
This encapsulation defines the contents of the packet data, including
 the necessary headers, the organization of those packets into a logical
 stream, and the interpretation of the codec-specific granule position field.
It does not attempt to describe or specify the existing Ogg container format.
Readers unfamiliar with the basic concepts mentioned above are encouraged to
 review the details in <xref target="RFC3533"/>.
</t>

</section>

<section anchor="terminology" title="Terminology">
<t>
The key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD",
 "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and "OPTIONAL" in this
 document are to be interpreted as described in <xref target="RFC2119"/>.
</t>

</section>

<section anchor="packet_organization" title="Packet Organization">
<t>
An Ogg Opus stream is organized as follows (see
 <xref target="packet-org-example"/> for an example).
</t>

<figure anchor="packet-org-example"
 title="Example packet organization for a logical Ogg Opus stream"
 align="center">
<artwork align="center"><![CDATA[
    Page 0         Pages 1 ... n        Pages (n+1) ...
 +------------+ +---+ +---+ ... +---+ +-----------+ +---------+ +--
 |            | |   | |   |     |   | |           | |         | |
 |+----------+| |+-----------------+| |+-------------------+ +-----
 |||ID Header|| ||  Comment Header || ||Audio Data Packet 1| | ...
 |+----------+| |+-----------------+| |+-------------------+ +-----
 |            | |   | |   |     |   | |           | |         | |
 +------------+ +---+ +---+ ... +---+ +-----------+ +---------+ +--
 ^      ^                           ^
 |      |                           |
 |      |                           Mandatory Page Break
 |      |
 |      ID header is contained on a single page
 |
 'Beginning Of Stream'
]]></artwork>
</figure>

<t>
There are two mandatory header packets.
The first packet in the logical Ogg bitstream MUST contain the identification
 (ID) header, which uniquely identifies a stream as Opus audio.
The format of this header is defined in <xref target="id_header"/>.
It is placed alone (without any other packet data) on the first page of
 the logical Ogg bitstream, and completes on that page.
This page has its 'beginning of stream' flag set.
</t>
<t>
The second packet in the logical Ogg bitstream MUST contain the comment header,
 which contains user-supplied metadata.
The format of this header is defined in <xref target="comment_header"/>.
It MAY span multiple pages, beginning on the second page of the logical
 stream.
However many pages it spans, the comment header packet MUST finish the page on
 which it completes.
</t>
<t>
All subsequent pages are audio data pages, and the Ogg packets they contain are
 audio data packets.
Each audio data packet contains one Opus packet for each of N different
 streams, where N is typically one for mono or stereo, but MAY be greater than
 one for multichannel audio.
The value N is specified in the ID header (see
 <xref target="channel_mapping"/>), and is fixed over the entire length of the
 logical Ogg bitstream.
</t>
<t>
The first (N - 1) Opus packets, if any, are packed one after another
 into the Ogg packet, using the self-delimiting framing from Appendix B of
 <xref target="RFC6716"/>.
The remaining Opus packet is packed at the end of the Ogg packet using the
 regular, undelimited framing from Section 3 of <xref target="RFC6716"/>.
All of the Opus packets in a single Ogg packet MUST be constrained to have the
 same duration.
An implementation of this specification SHOULD treat any Opus packet whose
 duration is different from that of the first Opus packet in an Ogg packet as
 if it were a malformed Opus packet with an invalid Table Of Contents (TOC)
 sequence.
</t>
<t>
The TOC sequence at the beginning of each Opus packet indicates the coding
 mode, audio bandwidth, channel count, duration (frame size), and number of
 frames per packet, as described in Section 3.1
 of <xref target="RFC6716"/>.
The coding mode is one of SILK, Hybrid, or Constrained Energy Lapped Transform
 (CELT).
The combination of coding mode, audio bandwidth, and frame size is referred to
 as the configuration of an Opus packet.
</t>
<t>
Packets are placed into Ogg pages in order until the end of stream.
Audio data packets might span page boundaries.
The first audio data page could have the 'continued packet' flag set
 (indicating the first audio data packet is continued from a previous page) if,
 for example, it was a live stream joined mid-broadcast, with the headers
 pasted on the front.
If a page has the 'continued packet' flag set and one of the following
 conditions is also true:
<list style="symbols">
<t>the previous page with packet data does not end in a continued packet (does
 not end with a lacing value of 255) OR</t>
<t>the page sequence numbers are not consecutive,</t>
</list>
 then a demuxer MUST NOT attempt to decode the data for the first packet on the
 page unless the demuxer has some special knowledge that would allow it to
 interpret this data despite the missing pieces.
An implementation MUST treat a zero-octet audio data packet as if it were a
 malformed Opus packet as described in
 Section 3.4 of <xref target="RFC6716"/>.
</t>
<t>
A logical stream ends with a page with the 'end of stream' flag set, but
 implementations need to be prepared to deal with truncated streams that do not
 have a page marked 'end of stream'.
There is no reason for the final packet on the last page to be a continued
 packet, i.e., for the final lacing value to be 255.
However, demuxers might encounter such streams, possibly as the result of a
 transfer that did not complete or of corruption.
If a packet continues onto a subsequent page (i.e., when the page ends with a
 lacing value of 255) and one of the following conditions is also true:
<list style="symbols">
<t>the next page with packet data does not have the 'continued packet' flag
 set OR</t>
<t>there is no next page with packet data OR</t>
<t>the page sequence numbers are not consecutive,</t>
</list>
 then a demuxer MUST NOT attempt to decode the data from that packet unless the
 demuxer has some special knowledge that would allow it to interpret this data
 despite the missing pieces.
There MUST NOT be any more pages in an Opus logical bitstream after a page
 marked 'end of stream'.
</t>
</section>

<section anchor="granpos" title="Granule Position">
<t>
The granule position MUST be zero for the ID header page and the
 page where the comment header completes.
That is, the first page in the logical stream, and the last header
 page before the first audio data page both have a granule position of zero.
</t>
<t>
The granule position of an audio data page encodes the total number of PCM
 samples in the stream up to and including the last fully-decodable sample from
 the last packet completed on that page.
The granule position of the first audio data page will usually be larger than
 zero, as described in <xref target="start_granpos_restrictions"/>.
</t>

<t>
A page that is entirely spanned by a single packet (that completes on a
 subsequent page) has no granule position, and the granule position field is
 set to the special value '-1' in two's complement.
</t>

<t>
The granule position of an audio data page is in units of PCM audio samples at
 a fixed rate of 48 kHz (per channel; a stereo stream's granule position
 does not increment at twice the speed of a mono stream).
It is possible to run an Opus decoder at other sampling rates,
 but all Opus packets encode samples at a sampling rate that evenly divides
 48 kHz.
Therefore, the value in the granule position field always counts samples
 assuming a 48 kHz decoding rate, and the rest of this specification makes
 the same assumption.
</t>

<t>
The duration of an Opus packet as defined in <xref target="RFC6716"/> can be
 any multiple of 2.5 ms, up to a maximum of 120 ms.
This duration is encoded in the TOC sequence at the beginning of each packet.
The number of samples returned by a decoder corresponds to this duration
 exactly, even for the first few packets.
For example, a 20 ms packet fed to a decoder running at 48 kHz will
 always return 960 samples.
A demuxer can parse the TOC sequence at the beginning of each Ogg packet to
 work backwards or forwards from a packet with a known granule position (i.e.,
 the last packet completed on some page) in order to assign granule positions
 to every packet, or even every individual sample.
The one exception is the last page in the stream, as described below.
</t>

<t>
All other pages with completed packets after the first MUST have a granule
 position equal to the number of samples contained in packets that complete on
 that page plus the granule position of the most recent page with completed
 packets.
This guarantees that a demuxer can assign individual packets the same granule
 position when working forwards as when working backwards.
For this to work, there cannot be any gaps.
</t>

<section anchor="gap-repair" title="Repairing Gaps in Real-time Streams">
<t>
In order to support capturing a real-time stream that has lost or not
 transmitted packets, a multiplexer (muxer) SHOULD emit packets that explicitly
 request the use of Packet Loss Concealment (PLC) in place of the missing
 packets.
Implementations that fail to do so still MUST NOT increment the granule
 position for a page by anything other than the number of samples contained in
 packets that actually complete on that page.
</t>
<t>
Only gaps that are a multiple of 2.5 ms are repairable, as these are the
 only durations that can be created by packet loss or discontinuous
 transmission.
Muxers need not handle other gap sizes.
Creating the necessary packets involves synthesizing a TOC byte (defined in
Section 3.1 of <xref target="RFC6716"/>)—and whatever
 additional internal framing is needed—to indicate the packet duration
 for each stream.
The actual length of each missing Opus frame inside the packet is zero bytes,
 as defined in Section 3.2.1 of <xref target="RFC6716"/>.
</t>

<t>
Zero-byte frames MAY be packed into packets using any of codes 0, 1,
 2, or 3.
When successive frames have the same configuration, the higher code packings
 reduce overhead.
Likewise, if the TOC configuration matches, the muxer MAY further combine the
 empty frames with previous or subsequent non-zero-length frames (using
 code 2 or VBR code 3).
</t>

<t>
<xref target="RFC6716"/> does not impose any requirements on the PLC, but this
 section outlines choices that are expected to have a positive influence on
 most PLC implementations, including the reference implementation.
Synthesized TOC sequences SHOULD maintain the same mode, audio bandwidth,
 channel count, and frame size as the previous packet (if any).
This is the simplest and usually the most well-tested case for the PLC to
 handle and it covers all losses that do not include a configuration switch,
 as defined in Section 4.5 of <xref target="RFC6716"/>.
</t>

<t>
When a previous packet is available, keeping the audio bandwidth and channel
 count the same allows the PLC to provide maximum continuity in the concealment
 data it generates.
However, if the size of the gap is not a multiple of the most recent frame
 size, then the frame size will have to change for at least some frames.
Such changes SHOULD be delayed as long as possible to simplify
 things for PLC implementations.
</t>

<t>
As an example, a 95 ms gap could be encoded as nineteen 5 ms frames
 in two bytes with a single CBR code 3 packet.
If the previous frame size was 20 ms, using four 20 ms frames
 followed by three 5 ms frames requires 4 bytes (plus an extra byte
 of Ogg lacing overhead), but allows the PLC to use its well-tested steady
 state behavior for as long as possible.
The total bitrate of the latter approach, including Ogg overhead, is about
 0.4 kbps, so the impact on file size is minimal.
</t>

<t>
Changing modes is discouraged, since this causes some decoder implementations
 to reset their PLC state.
However, SILK and Hybrid mode frames cannot fill gaps that are not a multiple
 of 10 ms.
If switching to CELT mode is needed to match the gap size, a muxer SHOULD do
 so at the end of the gap to allow the PLC to function for as long as possible.
</t>

<t>
In the example above, if the previous frame was a 20 ms SILK mode frame,
 the better solution is to synthesize a packet describing four 20 ms SILK
 frames, followed by a packet with a single 10 ms SILK
 frame, and finally a packet with a 5 ms CELT frame, to fill the 95 ms
 gap.
This also requires four bytes to describe the synthesized packet data (two
 bytes for a CBR code 3 and one byte each for two code 0 packets) but three
 bytes of Ogg lacing overhead are needed to mark the packet boundaries.
At 0.6 kbps, this is still a minimal bitrate impact over a naive, low quality
 solution.
</t>

<t>
Since medium-band audio is an option only in the SILK mode, wideband frames
 SHOULD be generated if switching from that configuration to CELT mode, to
 ensure that any PLC implementation which does try to migrate state between
 the modes will be able to preserve all of the available audio bandwidth.
</t>

</section>

<section anchor="preskip" title="Pre-skip">
<t>
There is some amount of latency introduced during the decoding process, to
 allow for overlap in the CELT mode, stereo mixing in the SILK mode, and
 resampling.
The encoder might have introduced additional latency through its own resampling
 and analysis (though the exact amount is not specified).
Therefore, the first few samples produced by the decoder do not correspond to
 real input audio, but are instead composed of padding inserted by the encoder
 to compensate for this latency.
These samples need to be stored and decoded, as Opus is an asymptotically
 convergent predictive codec, meaning the decoded contents of each frame depend
 on the recent history of decoder inputs.
However, a player will want to skip these samples after decoding them.
</t>

<t>
A 'pre-skip' field in the ID header (see <xref target="id_header"/>) signals
 the number of samples that SHOULD be skipped (decoded but discarded) at the
 beginning of the stream, though some specific applications might have a reason
 for looking at that data.
This amount need not be a multiple of 2.5 ms, MAY be smaller than a single
 packet, or MAY span the contents of several packets.
These samples are not valid audio.
</t>

<t>
For example, if the first Opus frame uses the CELT mode, it will always
 produce 120 samples of windowed overlap-add data.
However, the overlap data is initially all zeros (since there is no prior
 frame), meaning this cannot, in general, accurately represent the original
 audio.
The SILK mode requires additional delay to account for its analysis and
 resampling latency.
The encoder delays the original audio to avoid this problem.
</t>

<t>
The pre-skip field MAY also be used to perform sample-accurate cropping of
 already encoded streams.
In this case, a value of at least 3840 samples (80 ms) provides
 sufficient history to the decoder that it will have converged
 before the stream's output begins.
</t>

</section>

<section anchor="pcm_sample_position" title="PCM Sample Position">
<t>
The PCM sample position is determined from the granule position using the
 formula
</t>
<figure align="center">
<artwork align="center"><![CDATA[
'PCM sample position' = 'granule position' - 'pre-skip' .
]]></artwork>
</figure>

<t>
For example, if the granule position of the first audio data page is 59,971,
 and the pre-skip is 11,971, then the PCM sample position of the last decoded
 sample from that page is 48,000.
</t>
<t>
This can be converted into a playback time using the formula
</t>
<figure align="center">
<artwork align="center"><![CDATA[
                  'PCM sample position'
'playback time' = --------------------- .
                         48000.0
]]></artwork>
</figure>

<t>
The initial PCM sample position before any samples are played is normally '0'.
In this case, the PCM sample position of the first audio sample to be played
 starts at '1', because it marks the time on the clock
 <spanx style="emph">after</spanx> that sample has been played, and a stream
 that is exactly one second long has a final PCM sample position of '48000',
 as in the example here.
</t>

<t>
Vorbis streams use a granule position smaller than the number of audio samples
 contained in the first audio data page to indicate that some of those samples
 are trimmed from the output (see <xref target="vorbis-trim"/>).
However, to do so, Vorbis requires that the first audio data page contains
 exactly two packets, in order to allow the decoder to perform PCM position
 adjustments before needing to return any PCM data.
Opus uses the pre-skip mechanism for this purpose instead, since the encoder
 might introduce more than a single packet's worth of latency, and since very
 large packets in streams with a very large number of channels might not fit
 on a single page.
537*a58d3d2aSXin Li</t> 538*a58d3d2aSXin Li</section> 539*a58d3d2aSXin Li 540*a58d3d2aSXin Li<section anchor="end_trimming" title="End Trimming"> 541*a58d3d2aSXin Li<t> 542*a58d3d2aSXin LiThe page with the 'end of stream' flag set MAY have a granule position that 543*a58d3d2aSXin Li indicates the page contains less audio data than would normally be returned by 544*a58d3d2aSXin Li decoding up through the final packet. 545*a58d3d2aSXin LiThis is used to end the stream somewhere other than an even frame boundary. 546*a58d3d2aSXin LiThe granule position of the most recent audio data page with completed packets 547*a58d3d2aSXin Li is used to make this determination, or '0' is used if there were no previous 548*a58d3d2aSXin Li audio data pages with a completed packet. 549*a58d3d2aSXin LiThe difference between these granule positions indicates how many samples to 550*a58d3d2aSXin Li keep after decoding the packets that completed on the final page. 551*a58d3d2aSXin LiThe remaining samples are discarded. 552*a58d3d2aSXin LiThe number of discarded samples SHOULD be no larger than the number decoded 553*a58d3d2aSXin Li from the last packet. 554*a58d3d2aSXin Li</t> 555*a58d3d2aSXin Li</section> 556*a58d3d2aSXin Li 557*a58d3d2aSXin Li<section anchor="start_granpos_restrictions" 558*a58d3d2aSXin Li title="Restrictions on the Initial Granule Position"> 559*a58d3d2aSXin Li<t> 560*a58d3d2aSXin LiThe granule position of the first audio data page with a completed packet MAY 561*a58d3d2aSXin Li be larger than the number of samples contained in packets that complete on 562*a58d3d2aSXin Li that page, however it MUST NOT be smaller, unless that page has the 'end of 563*a58d3d2aSXin Li stream' flag set. 564*a58d3d2aSXin LiAllowing a granule position larger than the number of samples allows the 565*a58d3d2aSXin Li beginning of a stream to be cropped or a live stream to be joined without 566*a58d3d2aSXin Li rewriting the granule position of all the remaining pages. 
567*a58d3d2aSXin LiThis means that the PCM sample position just before the first sample to be 568*a58d3d2aSXin Li played MAY be larger than '0'. 569*a58d3d2aSXin LiSynchronization when multiplexing with other logical streams still uses the PCM 570*a58d3d2aSXin Li sample position relative to '0' to compute sample times. 571*a58d3d2aSXin LiThis does not affect the behavior of pre-skip: exactly 'pre-skip' samples 572*a58d3d2aSXin Li SHOULD be skipped from the beginning of the decoded output, even if the 573*a58d3d2aSXin Li initial PCM sample position is greater than zero. 574*a58d3d2aSXin Li</t> 575*a58d3d2aSXin Li 576*a58d3d2aSXin Li<t> 577*a58d3d2aSXin LiOn the other hand, a granule position that is smaller than the number of 578*a58d3d2aSXin Li decoded samples prevents a demuxer from working backwards to assign each 579*a58d3d2aSXin Li packet or each individual sample a valid granule position, since granule 580*a58d3d2aSXin Li positions are non-negative. 581*a58d3d2aSXin LiAn implementation MUST treat any stream as invalid if the granule position 582*a58d3d2aSXin Li is smaller than the number of samples contained in packets that complete on 583*a58d3d2aSXin Li the first audio data page with a completed packet, unless that page has the 584*a58d3d2aSXin Li 'end of stream' flag set. 585*a58d3d2aSXin LiIt MAY defer this action until it decodes the last packet completed on that 586*a58d3d2aSXin Li page. 587*a58d3d2aSXin Li</t> 588*a58d3d2aSXin Li 589*a58d3d2aSXin Li<t> 590*a58d3d2aSXin LiIf that page has the 'end of stream' flag set, a demuxer MUST treat any stream 591*a58d3d2aSXin Li as invalid if its granule position is smaller than the 'pre-skip' amount. 592*a58d3d2aSXin LiThis would indicate that there are more samples to be skipped from the initial 593*a58d3d2aSXin Li decoded output than exist in the stream. 
594*a58d3d2aSXin LiIf the granule position is smaller than the number of decoded samples produced 595*a58d3d2aSXin Li by the packets that complete on that page, then a demuxer MUST use an initial 596*a58d3d2aSXin Li granule position of '0', and can work forwards from '0' to timestamp 597*a58d3d2aSXin Li individual packets. 598*a58d3d2aSXin LiIf the granule position is larger than the number of decoded samples available, 599*a58d3d2aSXin Li then the demuxer MUST still work backwards as described above, even if the 600*a58d3d2aSXin Li 'end of stream' flag is set, to determine the initial granule position, and 601*a58d3d2aSXin Li thus the initial PCM sample position. 602*a58d3d2aSXin LiBoth of these will be greater than '0' in this case. 603*a58d3d2aSXin Li</t> 604*a58d3d2aSXin Li</section> 605*a58d3d2aSXin Li 606*a58d3d2aSXin Li<section anchor="seeking_and_preroll" title="Seeking and Pre-roll"> 607*a58d3d2aSXin Li<t> 608*a58d3d2aSXin LiSeeking in Ogg files is best performed using a bisection search for a page 609*a58d3d2aSXin Li whose granule position corresponds to a PCM position at or before the seek 610*a58d3d2aSXin Li target. 611*a58d3d2aSXin LiWith appropriately weighted bisection, accurate seeking can be performed in 612*a58d3d2aSXin Li just one or two bisections on average, even in multi-gigabyte files. 613*a58d3d2aSXin LiSee <xref target="seeking"/> for an example of general implementation guidance. 614*a58d3d2aSXin Li</t> 615*a58d3d2aSXin Li 616*a58d3d2aSXin Li<t> 617*a58d3d2aSXin LiWhen seeking within an Ogg Opus stream, an implementation SHOULD start decoding 618*a58d3d2aSXin Li (and discarding the output) at least 3840 samples (80 ms) prior to 619*a58d3d2aSXin Li the seek target in order to ensure that the output audio is correct by the 620*a58d3d2aSXin Li time it reaches the seek target. 621*a58d3d2aSXin LiThis 'pre-roll' is separate from, and unrelated to, the 'pre-skip' used at the 622*a58d3d2aSXin Li beginning of the stream. 
623*a58d3d2aSXin LiIf the point 80 ms prior to the seek target comes before the initial PCM 624*a58d3d2aSXin Li sample position, an implementation SHOULD start decoding from the beginning of 625*a58d3d2aSXin Li the stream, applying pre-skip as normal, regardless of whether the pre-skip is 626*a58d3d2aSXin Li larger or smaller than 80 ms, and then continue to discard samples 627*a58d3d2aSXin Li to reach the seek target (if any). 628*a58d3d2aSXin Li</t> 629*a58d3d2aSXin Li</section> 630*a58d3d2aSXin Li 631*a58d3d2aSXin Li</section> 632*a58d3d2aSXin Li 633*a58d3d2aSXin Li<section anchor="headers" title="Header Packets"> 634*a58d3d2aSXin Li<t> 635*a58d3d2aSXin LiAn Ogg Opus logical stream contains exactly two mandatory header packets: 636*a58d3d2aSXin Li an identification header and a comment header. 637*a58d3d2aSXin Li</t> 638*a58d3d2aSXin Li 639*a58d3d2aSXin Li<section anchor="id_header" title="Identification Header"> 640*a58d3d2aSXin Li 641*a58d3d2aSXin Li<figure anchor="id_header_packet" title="ID Header Packet" align="center"> 642*a58d3d2aSXin Li<artwork align="center"><![CDATA[ 643*a58d3d2aSXin Li 0 1 2 3 644*a58d3d2aSXin Li 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 645*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 646*a58d3d2aSXin Li| 'O' | 'p' | 'u' | 's' | 647*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 648*a58d3d2aSXin Li| 'H' | 'e' | 'a' | 'd' | 649*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 650*a58d3d2aSXin Li| Version = 1 | Channel Count | Pre-skip | 651*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 652*a58d3d2aSXin Li| Input Sample Rate (Hz) | 653*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 654*a58d3d2aSXin Li| Output Gain (Q7.8 in dB) | Mapping Family| | 655*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ : 656*a58d3d2aSXin Li| | 
657*a58d3d2aSXin Li: Optional Channel Mapping Table... : 658*a58d3d2aSXin Li| | 659*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 660*a58d3d2aSXin Li]]></artwork> 661*a58d3d2aSXin Li</figure> 662*a58d3d2aSXin Li 663*a58d3d2aSXin Li<t> 664*a58d3d2aSXin LiThe fields in the identification (ID) header have the following meaning: 665*a58d3d2aSXin Li<list style="numbers"> 666*a58d3d2aSXin Li<t>Magic Signature: 667*a58d3d2aSXin Li<vspace blankLines="1"/> 668*a58d3d2aSXin LiThis is an 8-octet (64-bit) field that allows codec identification and is 669*a58d3d2aSXin Li human-readable. 670*a58d3d2aSXin LiIt contains, in order, the magic numbers: 671*a58d3d2aSXin Li<list style="empty"> 672*a58d3d2aSXin Li<t>0x4F 'O'</t> 673*a58d3d2aSXin Li<t>0x70 'p'</t> 674*a58d3d2aSXin Li<t>0x75 'u'</t> 675*a58d3d2aSXin Li<t>0x73 's'</t> 676*a58d3d2aSXin Li<t>0x48 'H'</t> 677*a58d3d2aSXin Li<t>0x65 'e'</t> 678*a58d3d2aSXin Li<t>0x61 'a'</t> 679*a58d3d2aSXin Li<t>0x64 'd'</t> 680*a58d3d2aSXin Li</list> 681*a58d3d2aSXin LiStarting with "Op" helps distinguish it from audio data packets, as this is an 682*a58d3d2aSXin Li invalid TOC sequence. 683*a58d3d2aSXin Li<vspace blankLines="1"/> 684*a58d3d2aSXin Li</t> 685*a58d3d2aSXin Li<t>Version (8 bits, unsigned): 686*a58d3d2aSXin Li<vspace blankLines="1"/> 687*a58d3d2aSXin LiThe version number MUST always be '1' for this version of the encapsulation 688*a58d3d2aSXin Li specification. 689*a58d3d2aSXin LiImplementations SHOULD treat streams where the upper four bits of the version 690*a58d3d2aSXin Li number match that of a recognized specification as backwards-compatible with 691*a58d3d2aSXin Li that specification. 692*a58d3d2aSXin LiThat is, the version number can be split into "major" and "minor" version 693*a58d3d2aSXin Li sub-fields, with changes to the "minor" sub-field (in the lower four bits) 694*a58d3d2aSXin Li signaling compatible changes. 
695*a58d3d2aSXin LiFor example, an implementation of this specification SHOULD accept any stream 696*a58d3d2aSXin Li with a version number of '15' or less, and SHOULD assume any stream with a 697*a58d3d2aSXin Li version number '16' or greater is incompatible. 698*a58d3d2aSXin LiThe initial version '1' was chosen to keep implementations from relying on this 699*a58d3d2aSXin Li octet as a null terminator for the "OpusHead" string. 700*a58d3d2aSXin Li<vspace blankLines="1"/> 701*a58d3d2aSXin Li</t> 702*a58d3d2aSXin Li<t>Output Channel Count 'C' (8 bits, unsigned): 703*a58d3d2aSXin Li<vspace blankLines="1"/> 704*a58d3d2aSXin LiThis is the number of output channels. 705*a58d3d2aSXin LiThis might be different than the number of encoded channels, which can change 706*a58d3d2aSXin Li on a packet-by-packet basis. 707*a58d3d2aSXin LiThis value MUST NOT be zero. 708*a58d3d2aSXin LiThe maximum allowable value depends on the channel mapping family, and might be 709*a58d3d2aSXin Li as large as 255. 710*a58d3d2aSXin LiSee <xref target="channel_mapping"/> for details. 711*a58d3d2aSXin Li<vspace blankLines="1"/> 712*a58d3d2aSXin Li</t> 713*a58d3d2aSXin Li<t>Pre-skip (16 bits, unsigned, little 714*a58d3d2aSXin Li endian): 715*a58d3d2aSXin Li<vspace blankLines="1"/> 716*a58d3d2aSXin LiThis is the number of samples (at 48 kHz) to discard from the decoder 717*a58d3d2aSXin Li output when starting playback, and also the number to subtract from a page's 718*a58d3d2aSXin Li granule position to calculate its PCM sample position. 719*a58d3d2aSXin LiWhen cropping the beginning of existing Ogg Opus streams, a pre-skip of at 720*a58d3d2aSXin Li least 3,840 samples (80 ms) is RECOMMENDED to ensure complete 721*a58d3d2aSXin Li convergence in the decoder. 
722*a58d3d2aSXin Li<vspace blankLines="1"/> 723*a58d3d2aSXin Li</t> 724*a58d3d2aSXin Li<t>Input Sample Rate (32 bits, unsigned, little 725*a58d3d2aSXin Li endian): 726*a58d3d2aSXin Li<vspace blankLines="1"/> 727*a58d3d2aSXin LiThis is the sample rate of the original input (before encoding), in Hz. 728*a58d3d2aSXin LiThis field is <spanx style="emph">not</spanx> the sample rate to use for 729*a58d3d2aSXin Li playback of the encoded data. 730*a58d3d2aSXin Li<vspace blankLines="1"/> 731*a58d3d2aSXin LiOpus can switch between internal audio bandwidths of 4, 6, 8, 12, and 732*a58d3d2aSXin Li 20 kHz. 733*a58d3d2aSXin LiEach packet in the stream can have a different audio bandwidth. 734*a58d3d2aSXin LiRegardless of the audio bandwidth, the reference decoder supports decoding any 735*a58d3d2aSXin Li stream at a sample rate of 8, 12, 16, 24, or 48 kHz. 736*a58d3d2aSXin LiThe original sample rate of the audio passed to the encoder is not preserved 737*a58d3d2aSXin Li by the lossy compression. 738*a58d3d2aSXin Li<vspace blankLines="1"/> 739*a58d3d2aSXin LiAn Ogg Opus player SHOULD select the playback sample rate according to the 740*a58d3d2aSXin Li following procedure: 741*a58d3d2aSXin Li<list style="numbers"> 742*a58d3d2aSXin Li<t>If the hardware supports 48 kHz playback, decode at 48 kHz.</t> 743*a58d3d2aSXin Li<t>Otherwise, if the hardware's highest available sample rate is a supported 744*a58d3d2aSXin Li rate, decode at this sample rate.</t> 745*a58d3d2aSXin Li<t>Otherwise, if the hardware's highest available sample rate is less than 746*a58d3d2aSXin Li 48 kHz, decode at the next higher Opus supported rate above the highest 747*a58d3d2aSXin Li available hardware rate and resample.</t> 748*a58d3d2aSXin Li<t>Otherwise, decode at 48 kHz and resample.</t> 749*a58d3d2aSXin Li</list> 750*a58d3d2aSXin LiHowever, the 'Input Sample Rate' field allows the muxer to pass the sample 751*a58d3d2aSXin Li rate of the original input stream as metadata. 
752*a58d3d2aSXin LiThis is useful when the user requires the output sample rate to match the 753*a58d3d2aSXin Li input sample rate. 754*a58d3d2aSXin LiFor example, when not playing the output, an implementation writing PCM format 755*a58d3d2aSXin Li samples to disk might choose to resample the audio back to the original input 756*a58d3d2aSXin Li sample rate to reduce surprise to the user, who might reasonably expect to get 757*a58d3d2aSXin Li back a file with the same sample rate. 758*a58d3d2aSXin Li<vspace blankLines="1"/> 759*a58d3d2aSXin LiA value of zero indicates 'unspecified'. 760*a58d3d2aSXin LiMuxers SHOULD write the actual input sample rate or zero, but implementations 761*a58d3d2aSXin Li which do something with this field SHOULD take care to behave sanely if given 762*a58d3d2aSXin Li crazy values (e.g., do not actually upsample the output to 10 MHz if 763*a58d3d2aSXin Li requested). 764*a58d3d2aSXin LiImplementations SHOULD support input sample rates between 8 kHz and 765*a58d3d2aSXin Li 192 kHz (inclusive). 766*a58d3d2aSXin LiRates outside this range MAY be ignored by falling back to the default rate of 767*a58d3d2aSXin Li 48 kHz instead. 768*a58d3d2aSXin Li<vspace blankLines="1"/> 769*a58d3d2aSXin Li</t> 770*a58d3d2aSXin Li<t>Output Gain (16 bits, signed, little endian): 771*a58d3d2aSXin Li<vspace blankLines="1"/> 772*a58d3d2aSXin LiThis is a gain to be applied when decoding. 773*a58d3d2aSXin LiIt is 20*log10 of the factor by which to scale the decoder output to achieve 774*a58d3d2aSXin Li the desired playback volume, stored in a 16-bit, signed, two's complement 775*a58d3d2aSXin Li fixed-point value with 8 fractional bits (i.e., 776*a58d3d2aSXin Li Q7.8 <xref target="q-notation"/>). 
777*a58d3d2aSXin Li<vspace blankLines="1"/> 778*a58d3d2aSXin LiTo apply the gain, an implementation could use 779*a58d3d2aSXin Li<figure align="center"> 780*a58d3d2aSXin Li<artwork align="center"><![CDATA[ 781*a58d3d2aSXin Lisample *= pow(10, output_gain/(20.0*256)) , 782*a58d3d2aSXin Li]]></artwork> 783*a58d3d2aSXin Li</figure> 784*a58d3d2aSXin Li where output_gain is the raw 16-bit value from the header. 785*a58d3d2aSXin Li<vspace blankLines="1"/> 786*a58d3d2aSXin LiPlayers and media frameworks SHOULD apply it by default. 787*a58d3d2aSXin LiIf a player chooses to apply any volume adjustment or gain modification, such 788*a58d3d2aSXin Li as the R128_TRACK_GAIN (see <xref target="comment_header"/>), the adjustment 789*a58d3d2aSXin Li MUST be applied in addition to this output gain in order to achieve playback 790*a58d3d2aSXin Li at the normalized volume. 791*a58d3d2aSXin Li<vspace blankLines="1"/> 792*a58d3d2aSXin LiA muxer SHOULD set this field to zero, and instead apply any gain prior to 793*a58d3d2aSXin Li encoding, when this is possible and does not conflict with the user's wishes. 794*a58d3d2aSXin LiA nonzero output gain indicates the gain was adjusted after encoding, or that 795*a58d3d2aSXin Li a user wished to adjust the gain for playback while preserving the ability 796*a58d3d2aSXin Li to recover the original signal amplitude. 797*a58d3d2aSXin Li<vspace blankLines="1"/> 798*a58d3d2aSXin LiAlthough the output gain has enormous range (+/- 128 dB, enough to amplify 799*a58d3d2aSXin Li inaudible sounds to the threshold of physical pain), most applications can 800*a58d3d2aSXin Li only reasonably use a small portion of this range around zero. 801*a58d3d2aSXin LiThe large range serves in part to ensure that gain can always be losslessly 802*a58d3d2aSXin Li transferred between OpusHead and R128 gain tags (see below) without 803*a58d3d2aSXin Li saturating. 
804*a58d3d2aSXin Li<vspace blankLines="1"/> 805*a58d3d2aSXin Li</t> 806*a58d3d2aSXin Li<t>Channel Mapping Family (8 bits, unsigned): 807*a58d3d2aSXin Li<vspace blankLines="1"/> 808*a58d3d2aSXin LiThis octet indicates the order and semantic meaning of the output channels. 809*a58d3d2aSXin Li<vspace blankLines="1"/> 810*a58d3d2aSXin LiEach currently specified value of this octet indicates a mapping family, which 811*a58d3d2aSXin Li defines a set of allowed channel counts, and the ordered set of channel names 812*a58d3d2aSXin Li for each allowed channel count. 813*a58d3d2aSXin LiThe details are described in <xref target="channel_mapping"/>. 814*a58d3d2aSXin Li</t> 815*a58d3d2aSXin Li<t>Channel Mapping Table: 816*a58d3d2aSXin LiThis table defines the mapping from encoded streams to output channels. 817*a58d3d2aSXin LiIts contents are specified in <xref target="channel_mapping"/>. 818*a58d3d2aSXin Li</t> 819*a58d3d2aSXin Li</list> 820*a58d3d2aSXin Li</t> 821*a58d3d2aSXin Li 822*a58d3d2aSXin Li<t> 823*a58d3d2aSXin LiAll fields in the ID headers are REQUIRED, except for the channel mapping 824*a58d3d2aSXin Li table, which MUST be omitted when the channel mapping family is 0, but 825*a58d3d2aSXin Li is REQUIRED otherwise. 826*a58d3d2aSXin LiImplementations SHOULD treat a stream as invalid if it contains an ID header 827*a58d3d2aSXin Li that does not have enough data for these fields, even if it contains a valid 828*a58d3d2aSXin Li Magic Signature. 829*a58d3d2aSXin LiFuture versions of this specification, even backwards-compatible versions, 830*a58d3d2aSXin Li might include additional fields in the ID header. 831*a58d3d2aSXin LiIf an ID header has a compatible major version, but a larger minor version, 832*a58d3d2aSXin Li an implementation MUST NOT treat it as invalid for containing additional data 833*a58d3d2aSXin Li not specified here, provided it still completes on the first page.
834*a58d3d2aSXin Li</t> 835*a58d3d2aSXin Li 836*a58d3d2aSXin Li<section anchor="channel_mapping" title="Channel Mapping"> 837*a58d3d2aSXin Li<t> 838*a58d3d2aSXin LiAn Ogg Opus stream allows mapping one number of Opus streams (N) to a possibly 839*a58d3d2aSXin Li larger number of decoded channels (M + N) to yet another number of 840*a58d3d2aSXin Li output channels (C), which might be larger or smaller than the number of 841*a58d3d2aSXin Li decoded channels. 842*a58d3d2aSXin LiThe order and meaning of these channels are defined by a channel mapping, 843*a58d3d2aSXin Li which consists of the 'channel mapping family' octet and, for channel mapping 844*a58d3d2aSXin Li families other than family 0, a channel mapping table, as illustrated in 845*a58d3d2aSXin Li <xref target="channel_mapping_table"/>. 846*a58d3d2aSXin Li</t> 847*a58d3d2aSXin Li 848*a58d3d2aSXin Li<figure anchor="channel_mapping_table" title="Channel Mapping Table" 849*a58d3d2aSXin Li align="center"> 850*a58d3d2aSXin Li<artwork align="center"><![CDATA[ 851*a58d3d2aSXin Li 0 1 2 3 852*a58d3d2aSXin Li 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 853*a58d3d2aSXin Li +-+-+-+-+-+-+-+-+ 854*a58d3d2aSXin Li | Stream Count | 855*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 856*a58d3d2aSXin Li| Coupled Count | Channel Mapping... : 857*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 858*a58d3d2aSXin Li]]></artwork> 859*a58d3d2aSXin Li</figure> 860*a58d3d2aSXin Li 861*a58d3d2aSXin Li<t> 862*a58d3d2aSXin LiThe fields in the channel mapping table have the following meaning: 863*a58d3d2aSXin Li<list style="numbers" counter="8"> 864*a58d3d2aSXin Li<t>Stream Count 'N' (8 bits, unsigned): 865*a58d3d2aSXin Li<vspace blankLines="1"/> 866*a58d3d2aSXin LiThis is the total number of streams encoded in each Ogg packet. 
867*a58d3d2aSXin LiThis value is necessary to correctly parse the packed Opus packets inside an 868*a58d3d2aSXin Li Ogg packet, as described in <xref target="packet_organization"/>. 869*a58d3d2aSXin LiThis value MUST NOT be zero, as without at least one Opus packet with a valid 870*a58d3d2aSXin Li TOC sequence, a demuxer cannot recover the duration of an Ogg packet. 871*a58d3d2aSXin Li<vspace blankLines="1"/> 872*a58d3d2aSXin LiFor channel mapping family 0, this value defaults to 1, and is not coded. 873*a58d3d2aSXin Li<vspace blankLines="1"/> 874*a58d3d2aSXin Li</t> 875*a58d3d2aSXin Li<t>Coupled Stream Count 'M' (8 bits, unsigned): 876*a58d3d2aSXin LiThis is the number of streams whose decoders are to be configured to produce 877*a58d3d2aSXin Li two channels (stereo). 878*a58d3d2aSXin LiThis MUST be no larger than the total number of streams, N. 879*a58d3d2aSXin Li<vspace blankLines="1"/> 880*a58d3d2aSXin LiEach packet in an Opus stream has an internal channel count of 1 or 2, which 881*a58d3d2aSXin Li can change from packet to packet. 882*a58d3d2aSXin LiThis is selected by the encoder depending on the bitrate and the audio being 883*a58d3d2aSXin Li encoded. 884*a58d3d2aSXin LiThe original channel count of the audio passed to the encoder is not 885*a58d3d2aSXin Li necessarily preserved by the lossy compression. 886*a58d3d2aSXin Li<vspace blankLines="1"/> 887*a58d3d2aSXin LiRegardless of the internal channel count, any Opus stream can be decoded as 888*a58d3d2aSXin Li mono (a single channel) or stereo (two channels) by appropriate initialization 889*a58d3d2aSXin Li of the decoder. 890*a58d3d2aSXin LiThe 'coupled stream count' field indicates that the decoders for the first M 891*a58d3d2aSXin Li Opus streams are to be initialized for stereo (two-channel) output, and the 892*a58d3d2aSXin Li remaining (N - M) decoders are to be initialized for mono (a single 893*a58d3d2aSXin Li channel) only. 
894*a58d3d2aSXin LiThe total number of decoded channels, (M + N), MUST be no larger than 895*a58d3d2aSXin Li 255, as there is no way to index more channels than that in the channel 896*a58d3d2aSXin Li mapping. 897*a58d3d2aSXin Li<vspace blankLines="1"/> 898*a58d3d2aSXin LiFor channel mapping family 0, this value defaults to (C - 1) 899*a58d3d2aSXin Li (i.e., 0 for mono and 1 for stereo), and is not coded. 900*a58d3d2aSXin Li<vspace blankLines="1"/> 901*a58d3d2aSXin Li</t> 902*a58d3d2aSXin Li<t>Channel Mapping (8*C bits): 903*a58d3d2aSXin LiThis contains one octet per output channel, indicating which decoded channel 904*a58d3d2aSXin Li is to be used for each one. 905*a58d3d2aSXin LiLet 'index' be the value of this octet for a particular output channel. 906*a58d3d2aSXin LiThis value MUST either be smaller than (M + N), or be the special 907*a58d3d2aSXin Li value 255. 908*a58d3d2aSXin LiIf 'index' is less than 2*M, the output MUST be taken from decoding stream 909*a58d3d2aSXin Li ('index'/2) as stereo and selecting the left channel if 'index' is even, and 910*a58d3d2aSXin Li the right channel if 'index' is odd. 911*a58d3d2aSXin LiIf 'index' is 2*M or larger, but less than 255, the output MUST be taken from 912*a58d3d2aSXin Li decoding stream ('index' - M) as mono. 913*a58d3d2aSXin LiIf 'index' is 255, the corresponding output channel MUST contain pure silence. 914*a58d3d2aSXin Li<vspace blankLines="1"/> 915*a58d3d2aSXin LiThe number of output channels, C, is not constrained to match the number of 916*a58d3d2aSXin Li decoded channels (M + N). 917*a58d3d2aSXin LiA single index value MAY appear multiple times, i.e., the same decoded channel 918*a58d3d2aSXin Li might be mapped to multiple output channels. 919*a58d3d2aSXin LiSome decoded channels might not be assigned to any output channel, as well. 
920*a58d3d2aSXin Li<vspace blankLines="1"/> 921*a58d3d2aSXin LiFor channel mapping family 0, the first index defaults to 0, and if 922*a58d3d2aSXin Li C == 2, the second index defaults to 1. 923*a58d3d2aSXin LiNeither index is coded. 924*a58d3d2aSXin Li</t> 925*a58d3d2aSXin Li</list> 926*a58d3d2aSXin Li</t> 927*a58d3d2aSXin Li 928*a58d3d2aSXin Li<t> 929*a58d3d2aSXin LiAfter producing the output channels, the channel mapping family determines the 930*a58d3d2aSXin Li semantic meaning of each one. 931*a58d3d2aSXin LiThere are three defined mapping families in this specification. 932*a58d3d2aSXin Li</t> 933*a58d3d2aSXin Li 934*a58d3d2aSXin Li<section anchor="channel_mapping_0" title="Channel Mapping Family 0"> 935*a58d3d2aSXin Li<t> 936*a58d3d2aSXin LiAllowed numbers of channels: 1 or 2. 937*a58d3d2aSXin LiRTP mapping. 938*a58d3d2aSXin LiThis is the same channel interpretation as <xref target="RFC7587"/>. 939*a58d3d2aSXin Li</t> 940*a58d3d2aSXin Li<t> 941*a58d3d2aSXin Li<list style="symbols"> 942*a58d3d2aSXin Li<t>1 channel: monophonic (mono).</t> 943*a58d3d2aSXin Li<t>2 channels: stereo (left, right).</t> 944*a58d3d2aSXin Li</list> 945*a58d3d2aSXin LiSpecial mapping: This channel mapping value also 946*a58d3d2aSXin Li indicates that the contents consist of a single Opus stream that is stereo if 947*a58d3d2aSXin Li and only if C == 2, with stream index 0 mapped to output 948*a58d3d2aSXin Li channel 0 (mono, or left channel) and stream index 1 mapped to 949*a58d3d2aSXin Li output channel 1 (right channel) if stereo. 950*a58d3d2aSXin LiWhen the 'channel mapping family' octet has this value, the channel mapping 951*a58d3d2aSXin Li table MUST be omitted from the ID header packet. 952*a58d3d2aSXin Li</t> 953*a58d3d2aSXin Li</section> 954*a58d3d2aSXin Li 955*a58d3d2aSXin Li<section anchor="channel_mapping_1" title="Channel Mapping Family 1"> 956*a58d3d2aSXin Li<t> 957*a58d3d2aSXin LiAllowed numbers of channels: 1...8. 958*a58d3d2aSXin LiVorbis channel order (see below).
959*a58d3d2aSXin Li</t> 960*a58d3d2aSXin Li<t> 961*a58d3d2aSXin LiEach channel is assigned to a speaker location in a conventional surround 962*a58d3d2aSXin Li arrangement. 963*a58d3d2aSXin LiSpecific locations depend on the number of channels, and are given below 964*a58d3d2aSXin Li in order of the corresponding channel indices. 965*a58d3d2aSXin Li<list style="symbols"> 966*a58d3d2aSXin Li <t>1 channel: monophonic (mono).</t> 967*a58d3d2aSXin Li <t>2 channels: stereo (left, right).</t> 968*a58d3d2aSXin Li <t>3 channels: linear surround (left, center, right).</t> 969*a58d3d2aSXin Li <t>4 channels: quadraphonic (front left, front right, rear left, rear right).</t> 970*a58d3d2aSXin Li <t>5 channels: 5.0 surround (front left, front center, front right, rear left, rear right).</t> 971*a58d3d2aSXin Li <t>6 channels: 5.1 surround (front left, front center, front right, rear left, rear right, LFE).</t> 972*a58d3d2aSXin Li <t>7 channels: 6.1 surround (front left, front center, front right, side left, side right, rear center, LFE).</t> 973*a58d3d2aSXin Li <t>8 channels: 7.1 surround (front left, front center, front right, side left, side right, rear left, rear right, LFE).</t> 974*a58d3d2aSXin Li</list> 975*a58d3d2aSXin Li</t> 976*a58d3d2aSXin Li<t> 977*a58d3d2aSXin LiThis set of surround options and speaker location orderings is the same 978*a58d3d2aSXin Li as those used by the Vorbis codec <xref target="vorbis-mapping"/>. 979*a58d3d2aSXin LiThe ordering is different from the one used by the 980*a58d3d2aSXin Li WAVE <xref target="wave-multichannel"/> and 981*a58d3d2aSXin Li Free Lossless Audio Codec (FLAC) <xref target="flac"/> formats, 982*a58d3d2aSXin Li so correct ordering requires permutation of the output channels when decoding 983*a58d3d2aSXin Li to or encoding from those formats. 984*a58d3d2aSXin Li'LFE' here refers to a Low Frequency Effects channel, often mapped to a 985*a58d3d2aSXin Li subwoofer with no particular spatial position.
986*a58d3d2aSXin LiImplementations SHOULD identify 'side' or 'rear' speaker locations with 987*a58d3d2aSXin Li 'surround' and 'back' as appropriate when interfacing with audio formats 988*a58d3d2aSXin Li or systems which prefer that terminology. 989*a58d3d2aSXin Li</t> 990*a58d3d2aSXin Li</section> 991*a58d3d2aSXin Li 992*a58d3d2aSXin Li<section anchor="channel_mapping_255" 993*a58d3d2aSXin Li title="Channel Mapping Family 255"> 994*a58d3d2aSXin Li<t> 995*a58d3d2aSXin LiAllowed numbers of channels: 1...255. 996*a58d3d2aSXin LiNo defined channel meaning. 997*a58d3d2aSXin Li</t> 998*a58d3d2aSXin Li<t> 999*a58d3d2aSXin LiChannels are unidentified. 1000*a58d3d2aSXin LiGeneral-purpose players SHOULD NOT attempt to play these streams. 1001*a58d3d2aSXin LiOffline implementations MAY deinterleave the output into separate PCM files, 1002*a58d3d2aSXin Li one per channel. 1003*a58d3d2aSXin LiImplementations SHOULD NOT produce output for channels mapped to stream index 1004*a58d3d2aSXin Li 255 (pure silence) unless they have no other way to indicate the index of 1005*a58d3d2aSXin Li non-silent channels. 1006*a58d3d2aSXin Li</t> 1007*a58d3d2aSXin Li</section> 1008*a58d3d2aSXin Li 1009*a58d3d2aSXin Li<section anchor="channel_mapping_undefined" 1010*a58d3d2aSXin Li title="Undefined Channel Mappings"> 1011*a58d3d2aSXin Li<t> 1012*a58d3d2aSXin LiThe remaining channel mapping families (2...254) are reserved. 1013*a58d3d2aSXin LiA demuxer implementation encountering a reserved channel mapping family value 1014*a58d3d2aSXin Li SHOULD act as though the value is 255. 1015*a58d3d2aSXin Li</t> 1016*a58d3d2aSXin Li</section> 1017*a58d3d2aSXin Li 1018*a58d3d2aSXin Li<section anchor="downmix" title="Downmixing"> 1019*a58d3d2aSXin Li<t> 1020*a58d3d2aSXin LiAn Ogg Opus player MUST support any valid channel mapping with a channel 1021*a58d3d2aSXin Li mapping family of 0 or 1, even if the number of channels does not match the 1022*a58d3d2aSXin Li physically connected audio hardware. 
1023*a58d3d2aSXin LiPlayers SHOULD perform channel mixing to increase or reduce the number of 1024*a58d3d2aSXin Li channels as needed. 1025*a58d3d2aSXin Li</t> 1026*a58d3d2aSXin Li 1027*a58d3d2aSXin Li<t> 1028*a58d3d2aSXin LiImplementations MAY use the matrices in 1029*a58d3d2aSXin Li Figures <xref target="downmix-matrix-3" format="counter"/> 1030*a58d3d2aSXin Li through <xref target="downmix-matrix-8" format="counter"/> to implement 1031*a58d3d2aSXin Li downmixing from multichannel files using 1032*a58d3d2aSXin Li <xref target="channel_mapping_1">Channel Mapping Family 1</xref>, which are 1033*a58d3d2aSXin Li known to give acceptable results for stereo. 1034*a58d3d2aSXin LiMatrices for 3 and 4 channels are normalized so each coefficient row sums 1035*a58d3d2aSXin Li to 1 to avoid clipping. 1036*a58d3d2aSXin LiFor 5 or more channels they are normalized to 2 as a compromise between 1037*a58d3d2aSXin Li clipping and dynamic range reduction. 1038*a58d3d2aSXin Li</t> 1039*a58d3d2aSXin Li<t> 1040*a58d3d2aSXin LiIn these matrices the front left and front right channels are generally 1041*a58d3d2aSXin Lipassed through directly. 1042*a58d3d2aSXin LiWhen a surround channel is split between both the left and right stereo 1043*a58d3d2aSXin Li channels, coefficients are chosen so their squares sum to 1, which 1044*a58d3d2aSXin Li helps preserve the perceived intensity. 1045*a58d3d2aSXin LiRear channels are mixed more diffusely or attenuated to maintain focus 1046*a58d3d2aSXin Li on the front channels. 
1047*a58d3d2aSXin Li</t> 1048*a58d3d2aSXin Li 1049*a58d3d2aSXin Li<figure anchor="downmix-matrix-3" 1050*a58d3d2aSXin Li title="Stereo downmix matrix for the linear surround channel mapping" 1051*a58d3d2aSXin Li align="center"> 1052*a58d3d2aSXin Li<artwork align="center"><![CDATA[ 1053*a58d3d2aSXin LiL output = ( 0.585786 * left + 0.414214 * center ) 1054*a58d3d2aSXin LiR output = ( 0.414214 * center + 0.585786 * right ) 1055*a58d3d2aSXin Li]]></artwork> 1056*a58d3d2aSXin Li<postamble> 1057*a58d3d2aSXin LiExact coefficient values are 1 and 1/sqrt(2), multiplied by 1058*a58d3d2aSXin Li 1/(1 + 1/sqrt(2)) for normalization. 1059*a58d3d2aSXin Li</postamble> 1060*a58d3d2aSXin Li</figure> 1061*a58d3d2aSXin Li 1062*a58d3d2aSXin Li<figure anchor="downmix-matrix-4" 1063*a58d3d2aSXin Li title="Stereo downmix matrix for the quadraphonic channel mapping" 1064*a58d3d2aSXin Li align="center"> 1065*a58d3d2aSXin Li<artwork align="center"><![CDATA[ 1066*a58d3d2aSXin Li/ \ / \ / FL \ 1067*a58d3d2aSXin Li| L output | | 0.422650 0.000000 0.366025 0.211325 | | FR | 1068*a58d3d2aSXin Li| R output | = | 0.000000 0.422650 0.211325 0.366025 | | RL | 1069*a58d3d2aSXin Li\ / \ / \ RR / 1070*a58d3d2aSXin Li]]></artwork> 1071*a58d3d2aSXin Li<postamble> 1072*a58d3d2aSXin LiExact coefficient values are 1, sqrt(3)/2 and 1/2, multiplied by 1073*a58d3d2aSXin Li 1/(1 + sqrt(3)/2 + 1/2) for normalization. 
1074*a58d3d2aSXin Li</postamble> 1075*a58d3d2aSXin Li</figure> 1076*a58d3d2aSXin Li 1077*a58d3d2aSXin Li<figure anchor="downmix-matrix-5" 1078*a58d3d2aSXin Li title="Stereo downmix matrix for the 5.0 surround mapping" 1079*a58d3d2aSXin Li align="center"> 1080*a58d3d2aSXin Li<artwork align="center"><![CDATA[ 1081*a58d3d2aSXin Li / FL \ 1082*a58d3d2aSXin Li/ \ / \ | FC | 1083*a58d3d2aSXin Li| L | | 0.650802 0.460186 0.000000 0.563611 0.325401 | | FR | 1084*a58d3d2aSXin Li| R | = | 0.000000 0.460186 0.650802 0.325401 0.563611 | | RL | 1085*a58d3d2aSXin Li\ / \ / | RR | 1086*a58d3d2aSXin Li \ / 1087*a58d3d2aSXin Li]]></artwork> 1088*a58d3d2aSXin Li<postamble> 1089*a58d3d2aSXin LiExact coefficient values are 1, 1/sqrt(2), sqrt(3)/2 and 1/2, multiplied by 1090*a58d3d2aSXin Li 2/(1 + 1/sqrt(2) + sqrt(3)/2 + 1/2) 1091*a58d3d2aSXin Li for normalization. 1092*a58d3d2aSXin Li</postamble> 1093*a58d3d2aSXin Li</figure> 1094*a58d3d2aSXin Li 1095*a58d3d2aSXin Li<figure anchor="downmix-matrix-6" 1096*a58d3d2aSXin Li title="Stereo downmix matrix for the 5.1 surround mapping" 1097*a58d3d2aSXin Li align="center"> 1098*a58d3d2aSXin Li<artwork align="center"><![CDATA[ 1099*a58d3d2aSXin Li /FL \ 1100*a58d3d2aSXin Li/ \ / \ |FC | 1101*a58d3d2aSXin Li|L| | 0.529067 0.374107 0.000000 0.458186 0.264534 0.374107 | |FR | 1102*a58d3d2aSXin Li|R| = | 0.000000 0.374107 0.529067 0.264534 0.458186 0.374107 | |RL | 1103*a58d3d2aSXin Li\ / \ / |RR | 1104*a58d3d2aSXin Li \LFE/ 1105*a58d3d2aSXin Li]]></artwork> 1106*a58d3d2aSXin Li<postamble> 1107*a58d3d2aSXin LiExact coefficient values are 1, 1/sqrt(2), sqrt(3)/2 and 1/2, multiplied by 1108*a58d3d2aSXin Li2/(1 + 1/sqrt(2) + sqrt(3)/2 + 1/2 + 1/sqrt(2)) 1109*a58d3d2aSXin Li for normalization. 
1110*a58d3d2aSXin Li</postamble> 1111*a58d3d2aSXin Li</figure> 1112*a58d3d2aSXin Li 1113*a58d3d2aSXin Li<figure anchor="downmix-matrix-7" 1114*a58d3d2aSXin Li title="Stereo downmix matrix for the 6.1 surround mapping" 1115*a58d3d2aSXin Li align="center"> 1116*a58d3d2aSXin Li<artwork align="center"><![CDATA[ 1117*a58d3d2aSXin Li / \ 1118*a58d3d2aSXin Li | 0.455310 0.321953 0.000000 0.394310 0.227655 0.278819 0.321953 | 1119*a58d3d2aSXin Li | 0.000000 0.321953 0.455310 0.227655 0.394310 0.278819 0.321953 | 1120*a58d3d2aSXin Li \ / 1121*a58d3d2aSXin Li]]></artwork> 1122*a58d3d2aSXin Li<postamble> 1123*a58d3d2aSXin LiExact coefficient values are 1, 1/sqrt(2), sqrt(3)/2, 1/2 and 1124*a58d3d2aSXin Li sqrt(3)/2/sqrt(2), multiplied by 1125*a58d3d2aSXin Li 2/(1 + 1/sqrt(2) + sqrt(3)/2 + 1/2 + 1126*a58d3d2aSXin Li sqrt(3)/2/sqrt(2) + 1/sqrt(2)) for normalization. 1127*a58d3d2aSXin LiThe coefficients are in the same order as in <xref target="channel_mapping_1" />, 1128*a58d3d2aSXin Li and the matrices above. 1129*a58d3d2aSXin Li</postamble> 1130*a58d3d2aSXin Li</figure> 1131*a58d3d2aSXin Li 1132*a58d3d2aSXin Li<figure anchor="downmix-matrix-8" 1133*a58d3d2aSXin Li title="Stereo downmix matrix for the 7.1 surround mapping" 1134*a58d3d2aSXin Li align="center"> 1135*a58d3d2aSXin Li<artwork align="center"><![CDATA[ 1136*a58d3d2aSXin Li/ \ 1137*a58d3d2aSXin Li| .388631 .274804 .000000 .336565 .194316 .336565 .194316 .274804 | 1138*a58d3d2aSXin Li| .000000 .274804 .388631 .194316 .336565 .194316 .336565 .274804 | 1139*a58d3d2aSXin Li\ / 1140*a58d3d2aSXin Li]]></artwork> 1141*a58d3d2aSXin Li<postamble> 1142*a58d3d2aSXin LiExact coefficient values are 1, 1/sqrt(2), sqrt(3)/2 and 1/2, multiplied by 1143*a58d3d2aSXin Li 2/(2 + 2/sqrt(2) + sqrt(3)) for normalization. 1144*a58d3d2aSXin LiThe coefficients are in the same order as in <xref target="channel_mapping_1" />, 1145*a58d3d2aSXin Li and the matrices above. 
1146*a58d3d2aSXin Li</postamble> 1147*a58d3d2aSXin Li</figure> 1148*a58d3d2aSXin Li 1149*a58d3d2aSXin Li</section> 1150*a58d3d2aSXin Li 1151*a58d3d2aSXin Li</section> <!-- end channel_mapping_table --> 1152*a58d3d2aSXin Li 1153*a58d3d2aSXin Li</section> <!-- end id_header --> 1154*a58d3d2aSXin Li 1155*a58d3d2aSXin Li<section anchor="comment_header" title="Comment Header"> 1156*a58d3d2aSXin Li 1157*a58d3d2aSXin Li<figure anchor="comment_header_packet" title="Comment Header Packet" 1158*a58d3d2aSXin Li align="center"> 1159*a58d3d2aSXin Li<artwork align="center"><![CDATA[ 1160*a58d3d2aSXin Li 0 1 2 3 1161*a58d3d2aSXin Li 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 1162*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1163*a58d3d2aSXin Li| 'O' | 'p' | 'u' | 's' | 1164*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1165*a58d3d2aSXin Li| 'T' | 'a' | 'g' | 's' | 1166*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1167*a58d3d2aSXin Li| Vendor String Length | 1168*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1169*a58d3d2aSXin Li| | 1170*a58d3d2aSXin Li: Vendor String... : 1171*a58d3d2aSXin Li| | 1172*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1173*a58d3d2aSXin Li| User Comment List Length | 1174*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1175*a58d3d2aSXin Li| User Comment #0 String Length | 1176*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1177*a58d3d2aSXin Li| | 1178*a58d3d2aSXin Li: User Comment #0 String... 
: 1179*a58d3d2aSXin Li| | 1180*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1181*a58d3d2aSXin Li| User Comment #1 String Length | 1182*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+ 1183*a58d3d2aSXin Li: : 1184*a58d3d2aSXin Li]]></artwork> 1185*a58d3d2aSXin Li</figure> 1186*a58d3d2aSXin Li 1187*a58d3d2aSXin Li<t> 1188*a58d3d2aSXin LiThe comment header consists of a 64-bit magic signature, followed by data in 1189*a58d3d2aSXin Li the same format as the <xref target="vorbis-comment"/> header used in Ogg 1190*a58d3d2aSXin Li Vorbis, except (like Ogg Theora and Speex) the final "framing bit" specified 1191*a58d3d2aSXin Li in the Vorbis spec is not present. 1192*a58d3d2aSXin Li<list style="numbers"> 1193*a58d3d2aSXin Li<t>Magic Signature: 1194*a58d3d2aSXin Li<vspace blankLines="1"/> 1195*a58d3d2aSXin LiThis is an 8-octet (64-bit) field that allows codec identification and is 1196*a58d3d2aSXin Li human-readable. 1197*a58d3d2aSXin LiIt contains, in order, the magic numbers: 1198*a58d3d2aSXin Li<list style="empty"> 1199*a58d3d2aSXin Li<t>0x4F 'O'</t> 1200*a58d3d2aSXin Li<t>0x70 'p'</t> 1201*a58d3d2aSXin Li<t>0x75 'u'</t> 1202*a58d3d2aSXin Li<t>0x73 's'</t> 1203*a58d3d2aSXin Li<t>0x54 'T'</t> 1204*a58d3d2aSXin Li<t>0x61 'a'</t> 1205*a58d3d2aSXin Li<t>0x67 'g'</t> 1206*a58d3d2aSXin Li<t>0x73 's'</t> 1207*a58d3d2aSXin Li</list> 1208*a58d3d2aSXin LiStarting with "Op" helps distinguish it from audio data packets, as this is an 1209*a58d3d2aSXin Li invalid TOC sequence. 1210*a58d3d2aSXin Li<vspace blankLines="1"/> 1211*a58d3d2aSXin Li</t> 1212*a58d3d2aSXin Li<t>Vendor String Length (32 bits, unsigned, little endian): 1213*a58d3d2aSXin Li<vspace blankLines="1"/> 1214*a58d3d2aSXin LiThis field gives the length of the following vendor string, in octets. 1215*a58d3d2aSXin LiIt MUST NOT indicate that the vendor string is longer than the rest of the 1216*a58d3d2aSXin Li packet. 
1217*a58d3d2aSXin Li<vspace blankLines="1"/> 1218*a58d3d2aSXin Li</t> 1219*a58d3d2aSXin Li<t>Vendor String (variable length, UTF-8 vector): 1220*a58d3d2aSXin Li<vspace blankLines="1"/> 1221*a58d3d2aSXin LiThis is a simple human-readable tag for vendor information, encoded as a UTF-8 1222*a58d3d2aSXin Li string <xref target="RFC3629"/>. 1223*a58d3d2aSXin LiNo terminating null octet is necessary. 1224*a58d3d2aSXin Li<vspace blankLines="1"/> 1225*a58d3d2aSXin LiThis tag is intended to identify the codec encoder and encapsulation 1226*a58d3d2aSXin Li implementations, for tracing differences in technical behavior. 1227*a58d3d2aSXin LiUser-facing applications can use the 'ENCODER' user comment tag to identify 1228*a58d3d2aSXin Li themselves. 1229*a58d3d2aSXin Li<vspace blankLines="1"/> 1230*a58d3d2aSXin Li</t> 1231*a58d3d2aSXin Li<t>User Comment List Length (32 bits, unsigned, little endian): 1232*a58d3d2aSXin Li<vspace blankLines="1"/> 1233*a58d3d2aSXin LiThis field indicates the number of user-supplied comments. 1234*a58d3d2aSXin LiIt MAY indicate there are zero user-supplied comments, in which case there are 1235*a58d3d2aSXin Li no additional fields in the packet. 1236*a58d3d2aSXin LiIt MUST NOT indicate that there are so many comments that the comment string 1237*a58d3d2aSXin Li lengths would require more data than is available in the rest of the packet. 1238*a58d3d2aSXin Li<vspace blankLines="1"/> 1239*a58d3d2aSXin Li</t> 1240*a58d3d2aSXin Li<t>User Comment #i String Length (32 bits, unsigned, little endian): 1241*a58d3d2aSXin Li<vspace blankLines="1"/> 1242*a58d3d2aSXin LiThis field gives the length of the following user comment string, in octets. 1243*a58d3d2aSXin LiThere is one for each user comment indicated by the 'user comment list length' 1244*a58d3d2aSXin Li field. 1245*a58d3d2aSXin LiIt MUST NOT indicate that the string is longer than the rest of the packet. 
1246*a58d3d2aSXin Li<vspace blankLines="1"/> 1247*a58d3d2aSXin Li</t> 1248*a58d3d2aSXin Li<t>User Comment #i String (variable length, UTF-8 vector): 1249*a58d3d2aSXin Li<vspace blankLines="1"/> 1250*a58d3d2aSXin LiThis field contains a single user comment encoded as a UTF-8 1251*a58d3d2aSXin Li string <xref target="RFC3629"/>. 1252*a58d3d2aSXin LiThere is one for each user comment indicated by the 'user comment list length' 1253*a58d3d2aSXin Li field. 1254*a58d3d2aSXin Li</t> 1255*a58d3d2aSXin Li</list> 1256*a58d3d2aSXin Li</t> 1257*a58d3d2aSXin Li 1258*a58d3d2aSXin Li<t> 1259*a58d3d2aSXin LiThe vendor string length and user comment list length are REQUIRED, and 1260*a58d3d2aSXin Li implementations SHOULD treat a stream as invalid if it contains a comment 1261*a58d3d2aSXin Li header that does not have enough data for these fields, or that does not 1262*a58d3d2aSXin Li contain enough data for the corresponding vendor string or user comments they 1263*a58d3d2aSXin Li describe. 1264*a58d3d2aSXin LiMaking this check before allocating the associated memory to contain the data 1265*a58d3d2aSXin Li helps prevent a possible Denial-of-Service (DoS) attack from small comment 1266*a58d3d2aSXin Li headers that claim to contain strings longer than the entire packet or more 1267*a58d3d2aSXin Li user comments than could possibly fit in the packet. 1268*a58d3d2aSXin Li</t> 1269*a58d3d2aSXin Li 1270*a58d3d2aSXin Li<t> 1271*a58d3d2aSXin LiImmediately following the user comment list, the comment header MAY 1272*a58d3d2aSXin Li contain zero-padding or other binary data which is not specified here. 1273*a58d3d2aSXin LiIf the least-significant bit of the first byte of this data is 1, then editors 1274*a58d3d2aSXin Li SHOULD preserve the contents of this data when updating the tags, but if this 1275*a58d3d2aSXin Li bit is 0, all such data MAY be treated as padding, and truncated or discarded 1276*a58d3d2aSXin Li as desired. 
1277*a58d3d2aSXin LiThis allows informal experimentation with the format of this binary data until 1278*a58d3d2aSXin Li it can be specified later. 1279*a58d3d2aSXin Li</t> 1280*a58d3d2aSXin Li 1281*a58d3d2aSXin Li<t> 1282*a58d3d2aSXin LiThe comment header can be arbitrarily large and might be spread over a large 1283*a58d3d2aSXin Li number of Ogg pages. 1284*a58d3d2aSXin LiImplementations MUST avoid attempting to allocate excessive amounts of memory 1285*a58d3d2aSXin Li when presented with a very large comment header. 1286*a58d3d2aSXin LiTo accomplish this, implementations MAY treat a stream as invalid if it has a 1287*a58d3d2aSXin Li comment header larger than 125,829,120 octets (120 MB), and MAY 1288*a58d3d2aSXin Li ignore individual comments that are not fully contained within the first 1289*a58d3d2aSXin Li 61,440 octets of the comment header. 1290*a58d3d2aSXin Li</t> 1291*a58d3d2aSXin Li 1292*a58d3d2aSXin Li<section anchor="comment_format" title="Tag Definitions"> 1293*a58d3d2aSXin Li<t> 1294*a58d3d2aSXin LiThe user comment strings follow the NAME=value format described by 1295*a58d3d2aSXin Li <xref target="vorbis-comment"/> with the same recommended tag names: 1296*a58d3d2aSXin Li ARTIST, TITLE, DATE, ALBUM, and so on. 1297*a58d3d2aSXin Li</t> 1298*a58d3d2aSXin Li<t> 1299*a58d3d2aSXin LiTwo new comment tags are introduced here: 1300*a58d3d2aSXin Li</t> 1301*a58d3d2aSXin Li 1302*a58d3d2aSXin Li<t>First, an optional gain for track normalization:</t> 1303*a58d3d2aSXin Li<figure align="center"> 1304*a58d3d2aSXin Li<artwork align="left"><![CDATA[ 1305*a58d3d2aSXin LiR128_TRACK_GAIN=-573 1306*a58d3d2aSXin Li]]></artwork> 1307*a58d3d2aSXin Li</figure> 1308*a58d3d2aSXin Li<t> 1309*a58d3d2aSXin Li representing the volume shift needed to normalize the track's volume 1310*a58d3d2aSXin Li during isolated playback, in random shuffle, and so on. 1311*a58d3d2aSXin LiThe gain is a Q7.8 fixed point number in dB, as in the ID header's 'output 1312*a58d3d2aSXin Li gain' field. 
1313*a58d3d2aSXin LiThis tag is similar to the REPLAYGAIN_TRACK_GAIN tag in 1314*a58d3d2aSXin Li Vorbis <xref target="replay-gain"/>, except that the normal volume 1315*a58d3d2aSXin Li reference is the <xref target="EBU-R128"/> standard. 1316*a58d3d2aSXin Li</t> 1317*a58d3d2aSXin Li<t>Second, an optional gain for album normalization:</t> 1318*a58d3d2aSXin Li<figure align="center"> 1319*a58d3d2aSXin Li<artwork align="left"><![CDATA[ 1320*a58d3d2aSXin LiR128_ALBUM_GAIN=111 1321*a58d3d2aSXin Li]]></artwork> 1322*a58d3d2aSXin Li</figure> 1323*a58d3d2aSXin Li<t> 1324*a58d3d2aSXin Li representing the volume shift needed to normalize the overall volume when 1325*a58d3d2aSXin Li played as part of a particular collection of tracks. 1326*a58d3d2aSXin LiThe gain is also a Q7.8 fixed point number in dB, as in the ID header's 1327*a58d3d2aSXin Li 'output gain' field. 1328*a58d3d2aSXin LiThe values '-573' and '111' given here are just examples. 1329*a58d3d2aSXin Li</t> 1330*a58d3d2aSXin Li<t> 1331*a58d3d2aSXin LiAn Ogg Opus stream MUST NOT have more than one of each of these tags, and if 1332*a58d3d2aSXin Li present their values MUST be an integer from -32768 to 32767, inclusive, 1333*a58d3d2aSXin Li represented in ASCII as a base 10 number with no whitespace. 1334*a58d3d2aSXin LiA leading '+' or '-' character is valid. 1335*a58d3d2aSXin LiLeading zeros are also permitted, but the value MUST be represented by 1336*a58d3d2aSXin Li no more than 6 characters. 1337*a58d3d2aSXin LiOther non-digit characters MUST NOT be present. 1338*a58d3d2aSXin Li</t> 1339*a58d3d2aSXin Li<t> 1340*a58d3d2aSXin LiIf present, R128_TRACK_GAIN and R128_ALBUM_GAIN MUST correctly represent 1341*a58d3d2aSXin Li the R128 normalization gain relative to the 'output gain' field specified 1342*a58d3d2aSXin Li in the ID header. 
1343*a58d3d2aSXin LiIf a player chooses to make use of the R128_TRACK_GAIN tag or the 1344*a58d3d2aSXin Li R128_ALBUM_GAIN tag, it MUST apply those gains 1345*a58d3d2aSXin Li <spanx style="emph">in addition</spanx> to the 'output gain' value. 1346*a58d3d2aSXin LiIf a tool modifies the ID header's 'output gain' field, it MUST also update or 1347*a58d3d2aSXin Li remove the R128_TRACK_GAIN and R128_ALBUM_GAIN comment tags if present. 1348*a58d3d2aSXin LiA muxer SHOULD place the gain it wants other tools to use by default into the 1349*a58d3d2aSXin Li 'output gain' field, and not the comment tag. 1350*a58d3d2aSXin Li</t> 1351*a58d3d2aSXin Li<t> 1352*a58d3d2aSXin LiTo avoid confusion with multiple normalization schemes, an Opus comment header 1353*a58d3d2aSXin Li SHOULD NOT contain any of the REPLAYGAIN_TRACK_GAIN, REPLAYGAIN_TRACK_PEAK, 1354*a58d3d2aSXin Li REPLAYGAIN_ALBUM_GAIN, or REPLAYGAIN_ALBUM_PEAK tags, unless they are only 1355*a58d3d2aSXin Li to be used in some context where there is guaranteed to be no such confusion. 1356*a58d3d2aSXin Li<xref target="EBU-R128"/> normalization is preferred to the earlier 1357*a58d3d2aSXin Li REPLAYGAIN schemes because of its clear definition and adoption by industry. 1358*a58d3d2aSXin LiPeak normalizations are difficult to calculate reliably for lossy codecs 1359*a58d3d2aSXin Li because of variation in excursion heights due to decoder differences. 1360*a58d3d2aSXin LiIn the authors' investigations they were not applied consistently or broadly 1361*a58d3d2aSXin Li enough to merit inclusion here. 
1362*a58d3d2aSXin Li</t> 1363*a58d3d2aSXin Li</section> <!-- end comment_format --> 1364*a58d3d2aSXin Li</section> <!-- end comment_header --> 1365*a58d3d2aSXin Li 1366*a58d3d2aSXin Li</section> <!-- end headers --> 1367*a58d3d2aSXin Li 1368*a58d3d2aSXin Li<section anchor="packet_size_limits" title="Packet Size Limits"> 1369*a58d3d2aSXin Li<t> 1370*a58d3d2aSXin LiTechnically, valid Opus packets can be arbitrarily large due to the padding 1371*a58d3d2aSXin Li format, although the amount of non-padding data they can contain is bounded. 1372*a58d3d2aSXin LiThese packets might be spread over a similarly enormous number of Ogg pages. 1373*a58d3d2aSXin LiWhen encoding, implementations SHOULD limit the use of padding in audio data 1374*a58d3d2aSXin Li packets to no more than is necessary to make a variable bitrate (VBR) stream 1375*a58d3d2aSXin Li constant bitrate (CBR), unless they have no reasonable way to determine what 1376*a58d3d2aSXin Li is necessary. 1377*a58d3d2aSXin LiDemuxers SHOULD treat audio data packets as invalid (treat them as if they were 1378*a58d3d2aSXin Li malformed Opus packets with an invalid TOC sequence) if they are larger than 1379*a58d3d2aSXin Li 61,440 octets per Opus stream, unless they have a specific reason for 1380*a58d3d2aSXin Li allowing extra padding. 1381*a58d3d2aSXin LiSuch packets necessarily contain more padding than needed to make a stream CBR. 1382*a58d3d2aSXin LiDemuxers MUST avoid attempting to allocate excessive amounts of memory when 1383*a58d3d2aSXin Li presented with a very large packet. 1384*a58d3d2aSXin LiDemuxers MAY treat audio data packets as invalid or partially process them if 1385*a58d3d2aSXin Li they are larger than 61,440 octets in an Ogg Opus stream with channel 1386*a58d3d2aSXin Li mapping families 0 or 1. 
1387*a58d3d2aSXin LiDemuxers MAY treat audio data packets as invalid or partially process them in 1388*a58d3d2aSXin Li any Ogg Opus stream if the packet is larger than 61,440 octets and also 1389*a58d3d2aSXin Li larger than 7,680 octets per Opus stream. 1390*a58d3d2aSXin LiThe presence of an extremely large packet in the stream could indicate a 1391*a58d3d2aSXin Li memory exhaustion attack or stream corruption. 1392*a58d3d2aSXin Li</t> 1393*a58d3d2aSXin Li<t> 1394*a58d3d2aSXin LiIn an Ogg Opus stream, the largest possible valid packet that does not use 1395*a58d3d2aSXin Li padding has a size of (61,298*N - 2) octets. 1396*a58d3d2aSXin LiWith 255 streams, this is 15,630,988 octets and can 1397*a58d3d2aSXin Li span up to 61,298 Ogg pages, all but one of which will have a granule 1398*a58d3d2aSXin Li position of -1. 1399*a58d3d2aSXin LiThis is of course a very extreme packet, consisting of 255 streams, each 1400*a58d3d2aSXin Li containing 120 ms of audio encoded as 2.5 ms frames, each frame 1401*a58d3d2aSXin Li using the maximum possible number of octets (1275) and stored in the least 1402*a58d3d2aSXin Li efficient manner allowed (a VBR code 3 Opus packet). 1403*a58d3d2aSXin LiEven in such a packet, most of the data will be zeros as 2.5 ms frames 1404*a58d3d2aSXin Li cannot actually use all 1275 octets. 1405*a58d3d2aSXin Li</t> 1406*a58d3d2aSXin Li<t> 1407*a58d3d2aSXin LiThe largest packet consisting of entirely useful data is 1408*a58d3d2aSXin Li (15,326*N - 2) octets. 1409*a58d3d2aSXin LiThis corresponds to 120 ms of audio encoded as 10 ms frames in either 1410*a58d3d2aSXin Li SILK or Hybrid mode, but at a data rate of over 1 Mbps, which makes little 1411*a58d3d2aSXin Li sense for the quality achieved. 1412*a58d3d2aSXin Li</t> 1413*a58d3d2aSXin Li<t> 1414*a58d3d2aSXin LiA more reasonable limit is (7,664*N - 2) octets. 
1415*a58d3d2aSXin LiThis corresponds to 120 ms of audio encoded as 20 ms stereo CELT mode 1416*a58d3d2aSXin Li frames, with a total bitrate just under 511 kbps (not counting the Ogg 1417*a58d3d2aSXin Li encapsulation overhead). 1418*a58d3d2aSXin LiFor channel mapping family 1, N=8 provides a reasonable upper bound, as it 1419*a58d3d2aSXin Li allows for each of the 8 possible output channels to be decoded from a 1420*a58d3d2aSXin Li separate stereo Opus stream. 1421*a58d3d2aSXin LiThis gives a size of 61,310 octets, which is rounded up to a multiple of 1422*a58d3d2aSXin Li 1,024 octets to yield the audio data packet size of 61,440 octets 1423*a58d3d2aSXin Li that any implementation is expected to be able to process successfully. 1424*a58d3d2aSXin Li</t> 1425*a58d3d2aSXin Li</section> 1426*a58d3d2aSXin Li 1427*a58d3d2aSXin Li<section anchor="encoder" title="Encoder Guidelines"> 1428*a58d3d2aSXin Li<t> 1429*a58d3d2aSXin LiWhen encoding Opus streams, Ogg muxers SHOULD take into account the 1430*a58d3d2aSXin Li algorithmic delay of the Opus encoder. 1431*a58d3d2aSXin Li</t> 1432*a58d3d2aSXin Li<t> 1433*a58d3d2aSXin LiIn encoders derived from the reference 1434*a58d3d2aSXin Li implementation <xref target="RFC6716"/>, the number of samples can be 1435*a58d3d2aSXin Li queried with: 1436*a58d3d2aSXin Li</t> 1437*a58d3d2aSXin Li<figure align="center"> 1438*a58d3d2aSXin Li<artwork align="center"><![CDATA[ 1439*a58d3d2aSXin Li opus_encoder_ctl(encoder_state, OPUS_GET_LOOKAHEAD(&delay_samples)); 1440*a58d3d2aSXin Li]]></artwork> 1441*a58d3d2aSXin Li</figure> 1442*a58d3d2aSXin Li<t> 1443*a58d3d2aSXin LiTo achieve good quality in the very first samples of a stream, implementations 1444*a58d3d2aSXin Li MAY use linear predictive coding (LPC) extrapolation to generate at least 120 1445*a58d3d2aSXin Li extra samples at the beginning to avoid the Opus encoder having to encode a 1446*a58d3d2aSXin Li discontinuous signal. 
1447*a58d3d2aSXin LiFor more information on linear prediction, see 1448*a58d3d2aSXin Li <xref target="linear-prediction"/>. 1449*a58d3d2aSXin LiFor an input file containing 'length' samples, the implementation SHOULD set 1450*a58d3d2aSXin Li the pre-skip header value to (delay_samples + extra_samples), encode 1451*a58d3d2aSXin Li at least (length + delay_samples + extra_samples) 1452*a58d3d2aSXin Li samples, and set the granule position of the last page to 1453*a58d3d2aSXin Li (length + delay_samples + extra_samples). 1454*a58d3d2aSXin LiThis ensures that the encoded file has the same duration as the original, with 1455*a58d3d2aSXin Li no time offset. The best way to pad the end of the stream is to also use LPC 1456*a58d3d2aSXin Li extrapolation, but zero-padding is also acceptable. 1457*a58d3d2aSXin Li</t> 1458*a58d3d2aSXin Li 1459*a58d3d2aSXin Li<section anchor="lpc" title="LPC Extrapolation"> 1460*a58d3d2aSXin Li<t> 1461*a58d3d2aSXin LiThe first step in LPC extrapolation is to compute linear prediction 1462*a58d3d2aSXin Li coefficients <xref target="lpc-sample"/>. 1463*a58d3d2aSXin LiWhen extending the end of the signal, order-N (typically with N ranging from 8 1464*a58d3d2aSXin Li to 40) LPC analysis is performed on a window near the end of the signal. 1465*a58d3d2aSXin LiThe last N samples are used as memory to an infinite impulse response (IIR) 1466*a58d3d2aSXin Li filter. 1467*a58d3d2aSXin Li</t> 1468*a58d3d2aSXin Li<t> 1469*a58d3d2aSXin LiThe filter is then applied on a zero input to extrapolate the end of the signal. 
1470*a58d3d2aSXin LiLet a(k) be the kth LPC coefficient and x(n) be the nth sample of the signal. 1471*a58d3d2aSXin LiEach new sample past the end of the signal is computed as: 1472*a58d3d2aSXin Li</t> 1473*a58d3d2aSXin Li<figure align="center"> 1474*a58d3d2aSXin Li<artwork align="center"><![CDATA[ 1475*a58d3d2aSXin Li N 1476*a58d3d2aSXin Li --- 1477*a58d3d2aSXin Lix(n) = \ a(k)*x(n-k) 1478*a58d3d2aSXin Li / 1479*a58d3d2aSXin Li --- 1480*a58d3d2aSXin Li k=1 1481*a58d3d2aSXin Li]]></artwork> 1482*a58d3d2aSXin Li</figure> 1483*a58d3d2aSXin Li<t> 1484*a58d3d2aSXin LiThe process is repeated independently for each channel. 1485*a58d3d2aSXin LiIt is possible to extend the beginning of the signal by applying the same 1486*a58d3d2aSXin Li process backward in time. 1487*a58d3d2aSXin LiWhen extending the beginning of the signal, it is best to apply a "fade in" to 1488*a58d3d2aSXin Li the extrapolated signal, e.g. by multiplying it by a half-Hanning window 1489*a58d3d2aSXin Li <xref target="hanning"/>. 1490*a58d3d2aSXin Li</t> 1491*a58d3d2aSXin Li 1492*a58d3d2aSXin Li</section> 1493*a58d3d2aSXin Li 1494*a58d3d2aSXin Li<section anchor="continuous_chaining" title="Continuous Chaining"> 1495*a58d3d2aSXin Li<t> 1496*a58d3d2aSXin LiIn some applications, such as Internet radio, it is desirable to cut a long 1497*a58d3d2aSXin Li stream into smaller chains, e.g. so the comment header can be updated. 1498*a58d3d2aSXin LiThis can be done simply by separating the input streams into segments and 1499*a58d3d2aSXin Li encoding each segment independently. 1500*a58d3d2aSXin LiThe drawback of this approach is that it creates a small discontinuity 1501*a58d3d2aSXin Li at the boundary due to the lossy nature of Opus. 
1502*a58d3d2aSXin LiA muxer MAY avoid this discontinuity by using the following procedure: 1503*a58d3d2aSXin Li<list style="numbers"> 1504*a58d3d2aSXin Li<t>Encode the last frame of the first segment as an independent frame by 1505*a58d3d2aSXin Li turning off all forms of inter-frame prediction. 1506*a58d3d2aSXin LiDe-emphasis is allowed.</t> 1507*a58d3d2aSXin Li<t>Set the granule position of the last page to a point near the end of the 1508*a58d3d2aSXin Li last frame.</t> 1509*a58d3d2aSXin Li<t>Begin the second segment with a copy of the last frame of the first 1510*a58d3d2aSXin Li segment.</t> 1511*a58d3d2aSXin Li<t>Set the pre-skip value of the second stream in such a way as to properly 1512*a58d3d2aSXin Li join the two streams.</t> 1513*a58d3d2aSXin Li<t>Continue the encoding process normally from there, without any reset to 1514*a58d3d2aSXin Li the encoder.</t> 1515*a58d3d2aSXin Li</list> 1516*a58d3d2aSXin Li</t> 1517*a58d3d2aSXin Li<t> 1518*a58d3d2aSXin LiIn encoders derived from the reference implementation, inter-frame prediction 1519*a58d3d2aSXin Li can be turned off by calling: 1520*a58d3d2aSXin Li</t> 1521*a58d3d2aSXin Li<figure align="center"> 1522*a58d3d2aSXin Li<artwork align="center"><![CDATA[ 1523*a58d3d2aSXin Li opus_encoder_ctl(encoder_state, OPUS_SET_PREDICTION_DISABLED(1)); 1524*a58d3d2aSXin Li]]></artwork> 1525*a58d3d2aSXin Li</figure> 1526*a58d3d2aSXin Li<t> 1527*a58d3d2aSXin LiFor best results, this implementation requires that prediction be explicitly 1528*a58d3d2aSXin Li enabled again before resuming normal encoding, even after a reset. 
1529*a58d3d2aSXin Li</t> 1530*a58d3d2aSXin Li 1531*a58d3d2aSXin Li</section> 1532*a58d3d2aSXin Li 1533*a58d3d2aSXin Li</section> 1534*a58d3d2aSXin Li 1535*a58d3d2aSXin Li<section anchor="implementation" title="Implementation Status"> 1536*a58d3d2aSXin Li<t> 1537*a58d3d2aSXin LiA brief summary of major implementations of this draft is available 1538*a58d3d2aSXin Li at <eref target="https://wiki.xiph.org/OggOpusImplementation"/>, 1539*a58d3d2aSXin Li along with their status. 1540*a58d3d2aSXin Li</t> 1541*a58d3d2aSXin Li<t> 1542*a58d3d2aSXin Li[Note to RFC Editor: please remove this entire section before 1543*a58d3d2aSXin Li final publication per <xref target="RFC6982"/>, along with 1544*a58d3d2aSXin Li its references.] 1545*a58d3d2aSXin Li</t> 1546*a58d3d2aSXin Li</section> 1547*a58d3d2aSXin Li 1548*a58d3d2aSXin Li<section anchor="security" title="Security Considerations"> 1549*a58d3d2aSXin Li<t> 1550*a58d3d2aSXin LiImplementations of the Opus codec need to take appropriate security 1551*a58d3d2aSXin Li considerations into account, as outlined in <xref target="RFC4732"/>. 1552*a58d3d2aSXin LiThis is just as much a problem for the container as it is for the codec itself. 1553*a58d3d2aSXin LiMalicious payloads and/or input streams can be used to attack codec 1554*a58d3d2aSXin Li implementations. 1555*a58d3d2aSXin LiImplementations MUST NOT overrun their allocated memory nor consume excessive 1556*a58d3d2aSXin Li resources when decoding payloads or processing input streams. 1557*a58d3d2aSXin LiAlthough problems in encoding applications are typically rarer, this still 1558*a58d3d2aSXin Li applies to a muxer, as vulnerabilities would allow an attacker to attack 1559*a58d3d2aSXin Li transcoding gateways. 1560*a58d3d2aSXin Li</t> 1561*a58d3d2aSXin Li 1562*a58d3d2aSXin Li<t> 1563*a58d3d2aSXin LiHeader parsing code contains the most likely area for potential overruns. 
1564*a58d3d2aSXin LiIt is important for implementations to ensure their buffers contain enough 1565*a58d3d2aSXin Li data for all of the required fields before attempting to read it (for example, 1566*a58d3d2aSXin Li for all of the channel map data in the ID header). 1567*a58d3d2aSXin LiImplementations would do well to validate the indices of the channel map, also, 1568*a58d3d2aSXin Li to ensure they meet all of the restrictions outlined in 1569*a58d3d2aSXin Li <xref target="channel_mapping"/>, in order to avoid attempting to read data 1570*a58d3d2aSXin Li from channels that do not exist. 1571*a58d3d2aSXin Li</t> 1572*a58d3d2aSXin Li 1573*a58d3d2aSXin Li<t> 1574*a58d3d2aSXin LiTo avoid excessive resource usage, we advise implementations to be especially 1575*a58d3d2aSXin Li wary of streams that might cause them to process far more data than was 1576*a58d3d2aSXin Li actually transmitted. 1577*a58d3d2aSXin LiFor example, a relatively small comment header may contain values for the 1578*a58d3d2aSXin Li string lengths or user comment list length that imply that it is many 1579*a58d3d2aSXin Li gigabytes in size. 1580*a58d3d2aSXin LiEven computing the size of the required buffer could overflow a 32-bit integer, 1581*a58d3d2aSXin Li and actually attempting to allocate such a buffer before verifying it would be 1582*a58d3d2aSXin Li a reasonable size is a bad idea. 1583*a58d3d2aSXin LiAfter reading the user comment list length, implementations might wish to 1584*a58d3d2aSXin Li verify that the header contains at least the minimum amount of data for that 1585*a58d3d2aSXin Li many comments (4 additional octets per comment, to indicate each has a 1586*a58d3d2aSXin Li length of zero) before proceeding any further, again taking care to avoid 1587*a58d3d2aSXin Li overflow in these calculations. 
1588*a58d3d2aSXin LiIf allocating an array of pointers to point at these strings, the size of the 1589*a58d3d2aSXin Li pointers may be larger than 4 octets, potentially requiring a separate 1590*a58d3d2aSXin Li overflow check. 1591*a58d3d2aSXin Li</t> 1592*a58d3d2aSXin Li 1593*a58d3d2aSXin Li<t> 1594*a58d3d2aSXin LiAnother bug in this class we have observed more than once involves the handling 1595*a58d3d2aSXin Li of invalid data at the end of a stream. 1596*a58d3d2aSXin LiOften, implementations will seek to the end of a stream to locate the last 1597*a58d3d2aSXin Li timestamp in order to compute its total duration. 1598*a58d3d2aSXin LiIf they do not find a valid capture pattern and Ogg page from the desired 1599*a58d3d2aSXin Li logical stream, they will back up and try again. 1600*a58d3d2aSXin LiIf care is not taken to avoid re-scanning data that was already scanned, this 1601*a58d3d2aSXin Li search can quickly devolve into something with a complexity that is quadratic 1602*a58d3d2aSXin Li in the amount of invalid data. 1603*a58d3d2aSXin Li</t> 1604*a58d3d2aSXin Li 1605*a58d3d2aSXin Li<t> 1606*a58d3d2aSXin LiIn general when seeking, implementations will wish to be cautious about the 1607*a58d3d2aSXin Li effects of invalid granule position values, and ensure all algorithms will 1608*a58d3d2aSXin Li continue to make progress and eventually terminate, even if these are missing 1609*a58d3d2aSXin Li or out-of-order. 1610*a58d3d2aSXin Li</t> 1611*a58d3d2aSXin Li 1612*a58d3d2aSXin Li<t> 1613*a58d3d2aSXin LiLike most other container formats, Ogg Opus streams SHOULD NOT be used with 1614*a58d3d2aSXin Li insecure ciphers or cipher modes that are vulnerable to known-plaintext 1615*a58d3d2aSXin Li attacks. 1616*a58d3d2aSXin LiElements such as the Ogg page capture pattern and the magic signatures in the 1617*a58d3d2aSXin Li ID header and the comment header all have easily predictable values, in 1618*a58d3d2aSXin Li addition to various elements of the codec data itself. 
1619*a58d3d2aSXin Li</t> 1620*a58d3d2aSXin Li</section> 1621*a58d3d2aSXin Li 1622*a58d3d2aSXin Li<section anchor="content_type" title="Content Type"> 1623*a58d3d2aSXin Li<t> 1624*a58d3d2aSXin LiAn "Ogg Opus file" consists of one or more sequentially multiplexed segments, 1625*a58d3d2aSXin Li each containing exactly one Ogg Opus stream. 1626*a58d3d2aSXin LiThe RECOMMENDED mime-type for Ogg Opus files is "audio/ogg". 1627*a58d3d2aSXin Li</t> 1628*a58d3d2aSXin Li 1629*a58d3d2aSXin Li<t> 1630*a58d3d2aSXin LiIf more specificity is desired, one MAY indicate the presence of Opus streams 1631*a58d3d2aSXin Li using the codecs parameter defined in <xref target="RFC6381"/> and 1632*a58d3d2aSXin Li <xref target="RFC5334"/>, e.g., 1633*a58d3d2aSXin Li</t> 1634*a58d3d2aSXin Li<figure> 1635*a58d3d2aSXin Li<artwork align="center"><![CDATA[ 1636*a58d3d2aSXin Li audio/ogg; codecs=opus 1637*a58d3d2aSXin Li]]></artwork> 1638*a58d3d2aSXin Li</figure> 1639*a58d3d2aSXin Li<t> 1640*a58d3d2aSXin Li for an Ogg Opus file. 1641*a58d3d2aSXin Li</t> 1642*a58d3d2aSXin Li 1643*a58d3d2aSXin Li<t> 1644*a58d3d2aSXin LiThe RECOMMENDED filename extension for Ogg Opus files is '.opus'. 1645*a58d3d2aSXin Li</t> 1646*a58d3d2aSXin Li 1647*a58d3d2aSXin Li<t> 1648*a58d3d2aSXin LiWhen Opus is concurrently multiplexed with other streams in an Ogg container, 1649*a58d3d2aSXin Li one SHOULD use one of the "audio/ogg", "video/ogg", or "application/ogg" 1650*a58d3d2aSXin Li mime-types, as defined in <xref target="RFC5334"/>. 1651*a58d3d2aSXin LiSuch streams are not strictly "Ogg Opus files" as described above, 1652*a58d3d2aSXin Li since they contain more than a single Opus stream per sequentially 1653*a58d3d2aSXin Li multiplexed segment, e.g. video or multiple audio tracks. 1654*a58d3d2aSXin LiIn such cases the '.opus' filename extension is NOT RECOMMENDED.
1655*a58d3d2aSXin Li</t> 1656*a58d3d2aSXin Li 1657*a58d3d2aSXin Li<t> 1658*a58d3d2aSXin LiIn either case, this document updates <xref target="RFC5334"/> 1659*a58d3d2aSXin Li to add 'opus' as a codecs parameter value with char[8]: 'OpusHead' 1660*a58d3d2aSXin Li as Codec Identifier. 1661*a58d3d2aSXin Li</t> 1662*a58d3d2aSXin Li</section> 1663*a58d3d2aSXin Li 1664*a58d3d2aSXin Li<section anchor="iana" title="IANA Considerations"> 1665*a58d3d2aSXin Li<t> 1666*a58d3d2aSXin LiThis document updates the IANA Media Types registry to add .opus 1667*a58d3d2aSXin Li as a file extension for "audio/ogg", and to add itself as a reference 1668*a58d3d2aSXin Li alongside <xref target="RFC5334"/> for "audio/ogg", "video/ogg", and 1669*a58d3d2aSXin Li "application/ogg" Media Types. 1670*a58d3d2aSXin Li</t> 1671*a58d3d2aSXin Li<t> 1672*a58d3d2aSXin LiThis document defines a new registry "Opus Channel Mapping Families" to 1673*a58d3d2aSXin Li indicate how the semantic meanings of the channels in a multi-channel Opus 1674*a58d3d2aSXin Li stream are described. 1675*a58d3d2aSXin LiIANA is requested to create a new name space of "Opus Channel Mapping 1676*a58d3d2aSXin Li Families". 1677*a58d3d2aSXin LiThis will be a new registry on the IANA Matrix, and not a subregistry of an 1678*a58d3d2aSXin Li existing registry. 1679*a58d3d2aSXin LiModifications to this registry follow the "Specification Required" registration 1680*a58d3d2aSXin Li policy as defined in <xref target="RFC5226"/>. 
1681*a58d3d2aSXin LiEach registry entry consists of a Channel Mapping Family Number, which is 1682*a58d3d2aSXin Li specified in decimal in the range 0 to 255, inclusive, and a Reference (or 1683*a58d3d2aSXin Li list of references). 1684*a58d3d2aSXin LiEach Reference must point to sufficient documentation to describe what 1685*a58d3d2aSXin Li information is coded in the Opus identification header for this channel 1686*a58d3d2aSXin Li mapping family, how a demuxer determines the Stream Count ('N') and Coupled 1687*a58d3d2aSXin Li Stream Count ('M') from this information, and how it determines the proper 1688*a58d3d2aSXin Li interpretation of each of the decoded channels. 1689*a58d3d2aSXin Li</t> 1690*a58d3d2aSXin Li<t> 1691*a58d3d2aSXin LiThis document defines three initial assignments for this registry. 1692*a58d3d2aSXin Li</t> 1693*a58d3d2aSXin Li<texttable> 1694*a58d3d2aSXin Li<ttcol>Value</ttcol><ttcol>Reference</ttcol> 1695*a58d3d2aSXin Li<c>0</c><c>[RFCXXXX] <xref target="channel_mapping_0"/></c> 1696*a58d3d2aSXin Li<c>1</c><c>[RFCXXXX] <xref target="channel_mapping_1"/></c> 1697*a58d3d2aSXin Li<c>255</c><c>[RFCXXXX] <xref target="channel_mapping_255"/></c> 1698*a58d3d2aSXin Li</texttable> 1699*a58d3d2aSXin Li<t> 1700*a58d3d2aSXin LiThe designated expert will determine if the Reference points to a specification 1701*a58d3d2aSXin Li that meets the requirements for permanence and ready availability laid out 1702*a58d3d2aSXin Li in <xref target="RFC5226"/> and that it specifies the information 1703*a58d3d2aSXin Li described above with sufficient clarity to allow interoperable 1704*a58d3d2aSXin Li implementations. 1705*a58d3d2aSXin Li</t> 1706*a58d3d2aSXin Li</section> 1707*a58d3d2aSXin Li 1708*a58d3d2aSXin Li<section anchor="Acknowledgments" title="Acknowledgments"> 1709*a58d3d2aSXin Li<t> 1710*a58d3d2aSXin LiThanks to Ben Campbell, Joel M.
Halpern, Mark Harris, Greg Maxwell, 1711*a58d3d2aSXin Li Christopher "Monty" Montgomery, Jean-Marc Valin, Stephan Wenger, and Mo Zanaty 1712*a58d3d2aSXin Li for their valuable contributions to this document. 1713*a58d3d2aSXin LiAdditional thanks to Andrew D'Addesio, Greg Maxwell, and Vincent Penquerc'h for 1714*a58d3d2aSXin Li their feedback based on early implementations. 1715*a58d3d2aSXin Li</t> 1716*a58d3d2aSXin Li</section> 1717*a58d3d2aSXin Li 1718*a58d3d2aSXin Li<section title="RFC Editor Notes"> 1719*a58d3d2aSXin Li<t> 1720*a58d3d2aSXin LiIn <xref target="iana"/>, "RFCXXXX" is to be replaced with the RFC number 1721*a58d3d2aSXin Li assigned to this draft. 1722*a58d3d2aSXin Li</t> 1723*a58d3d2aSXin Li</section> 1724*a58d3d2aSXin Li 1725*a58d3d2aSXin Li</middle> 1726*a58d3d2aSXin Li<back> 1727*a58d3d2aSXin Li<references title="Normative References"> 1728*a58d3d2aSXin Li &rfc2119; 1729*a58d3d2aSXin Li &rfc3533; 1730*a58d3d2aSXin Li &rfc3629; 1731*a58d3d2aSXin Li &rfc5226; 1732*a58d3d2aSXin Li &rfc5334; 1733*a58d3d2aSXin Li &rfc6381; 1734*a58d3d2aSXin Li &rfc6716; 1735*a58d3d2aSXin Li 1736*a58d3d2aSXin Li<reference anchor="EBU-R128" target="https://tech.ebu.ch/loudness"> 1737*a58d3d2aSXin Li<front> 1738*a58d3d2aSXin Li <title>Loudness Recommendation EBU R128</title> 1739*a58d3d2aSXin Li <author> 1740*a58d3d2aSXin Li <organization>EBU Technical Committee</organization> 1741*a58d3d2aSXin Li </author> 1742*a58d3d2aSXin Li <date month="August" year="2011"/> 1743*a58d3d2aSXin Li</front> 1744*a58d3d2aSXin Li</reference> 1745*a58d3d2aSXin Li 1746*a58d3d2aSXin Li<reference anchor="vorbis-comment" 1747*a58d3d2aSXin Li target="https://www.xiph.org/vorbis/doc/v-comment.html"> 1748*a58d3d2aSXin Li<front> 1749*a58d3d2aSXin Li<title>Ogg Vorbis I Format Specification: Comment Field and Header 1750*a58d3d2aSXin Li Specification</title> 1751*a58d3d2aSXin Li<author initials="C." 
surname="Montgomery" 1752*a58d3d2aSXin Li fullname="Christopher &quot;Monty&quot; Montgomery"/> 1753*a58d3d2aSXin Li<date month="July" year="2002"/> 1754*a58d3d2aSXin Li</front> 1755*a58d3d2aSXin Li</reference> 1756*a58d3d2aSXin Li 1757*a58d3d2aSXin Li</references> 1758*a58d3d2aSXin Li 1759*a58d3d2aSXin Li<references title="Informative References"> 1760*a58d3d2aSXin Li 1761*a58d3d2aSXin Li<!--?rfc include="http://xml.resource.org/public/rfc/bibxml/reference.RFC.3550.xml"?--> 1762*a58d3d2aSXin Li &rfc4732; 1763*a58d3d2aSXin Li &rfc6982; 1764*a58d3d2aSXin Li &rfc7587; 1765*a58d3d2aSXin Li 1766*a58d3d2aSXin Li<reference anchor="flac" 1767*a58d3d2aSXin Li target="https://xiph.org/flac/format.html"> 1768*a58d3d2aSXin Li <front> 1769*a58d3d2aSXin Li <title>FLAC - Free Lossless Audio Codec Format Description</title> 1770*a58d3d2aSXin Li <author initials="J." surname="Coalson" fullname="Josh Coalson"/> 1771*a58d3d2aSXin Li <date month="January" year="2008"/> 1772*a58d3d2aSXin Li </front> 1773*a58d3d2aSXin Li</reference> 1774*a58d3d2aSXin Li 1775*a58d3d2aSXin Li<reference anchor="hanning" 1776*a58d3d2aSXin Li target="https://en.wikipedia.org/w/index.php?title=Window_function&amp;oldid=703074467#Hann_.28Hanning.29_window"> 1777*a58d3d2aSXin Li <front> 1778*a58d3d2aSXin Li <title>Hann window</title> 1779*a58d3d2aSXin Li <author> 1780*a58d3d2aSXin Li <organization>Wikipedia</organization> 1781*a58d3d2aSXin Li </author> 1782*a58d3d2aSXin Li <date month="February" year="2016"/> 1783*a58d3d2aSXin Li </front> 1784*a58d3d2aSXin Li</reference> 1785*a58d3d2aSXin Li 1786*a58d3d2aSXin Li<reference anchor="linear-prediction" 1787*a58d3d2aSXin Li target="https://en.wikipedia.org/w/index.php?title=Linear_predictive_coding&amp;oldid=687498962"> 1788*a58d3d2aSXin Li <front> 1789*a58d3d2aSXin Li <title>Linear Predictive Coding</title> 1790*a58d3d2aSXin Li <author> 1791*a58d3d2aSXin Li <organization>Wikipedia</organization> 1792*a58d3d2aSXin Li </author> 1793*a58d3d2aSXin Li <date month="October" year="2015"/>
1794*a58d3d2aSXin Li </front> 1795*a58d3d2aSXin Li</reference> 1796*a58d3d2aSXin Li 1797*a58d3d2aSXin Li<reference anchor="lpc-sample" 1798*a58d3d2aSXin Li target="https://svn.xiph.org/trunk/vorbis/lib/lpc.c"> 1799*a58d3d2aSXin Li<front> 1800*a58d3d2aSXin Li <title>Autocorrelation LPC coeff generation algorithm 1801*a58d3d2aSXin Li (Vorbis source code)</title> 1802*a58d3d2aSXin Li<author initials="J." surname="Degener" fullname="Jutta Degener"/> 1803*a58d3d2aSXin Li<author initials="C." surname="Bormann" fullname="Carsten Bormann"/> 1804*a58d3d2aSXin Li<date month="November" year="1994"/> 1805*a58d3d2aSXin Li</front> 1806*a58d3d2aSXin Li</reference> 1807*a58d3d2aSXin Li 1808*a58d3d2aSXin Li<reference anchor="q-notation" 1809*a58d3d2aSXin Li target="https://en.wikipedia.org/w/index.php?title=Q_%28number_format%29&amp;oldid=697252615"> 1810*a58d3d2aSXin Li<front> 1811*a58d3d2aSXin Li<title>Q (number format)</title> 1812*a58d3d2aSXin Li<author><organization>Wikipedia</organization></author> 1813*a58d3d2aSXin Li<date month="December" year="2015"/> 1814*a58d3d2aSXin Li</front> 1815*a58d3d2aSXin Li</reference> 1816*a58d3d2aSXin Li 1817*a58d3d2aSXin Li<reference anchor="replay-gain" 1818*a58d3d2aSXin Li target="https://wiki.xiph.org/VorbisComment#Replay_Gain"> 1819*a58d3d2aSXin Li<front> 1820*a58d3d2aSXin Li<title>VorbisComment: Replay Gain</title> 1821*a58d3d2aSXin Li<author initials="C." surname="Parker" fullname="Conrad Parker"/> 1822*a58d3d2aSXin Li<author initials="M." surname="Leese" fullname="Martin Leese"/> 1823*a58d3d2aSXin Li<date month="June" year="2009"/> 1824*a58d3d2aSXin Li</front> 1825*a58d3d2aSXin Li</reference> 1826*a58d3d2aSXin Li 1827*a58d3d2aSXin Li<reference anchor="seeking" 1828*a58d3d2aSXin Li target="https://wiki.xiph.org/Seeking"> 1829*a58d3d2aSXin Li<front> 1830*a58d3d2aSXin Li<title>Granulepos Encoding and How Seeking Really Works</title> 1831*a58d3d2aSXin Li<author initials="S."
surname="Pfeiffer" fullname="Silvia Pfeiffer"/> 1832*a58d3d2aSXin Li<author initials="C." surname="Parker" fullname="Conrad Parker"/> 1833*a58d3d2aSXin Li<author initials="G." surname="Maxwell" fullname="Greg Maxwell"/> 1834*a58d3d2aSXin Li<date month="May" year="2012"/> 1835*a58d3d2aSXin Li</front> 1836*a58d3d2aSXin Li</reference> 1837*a58d3d2aSXin Li 1838*a58d3d2aSXin Li<reference anchor="vorbis-mapping" 1839*a58d3d2aSXin Li target="https://www.xiph.org/vorbis/doc/Vorbis_I_spec.html#x1-810004.3.9"> 1840*a58d3d2aSXin Li<front> 1841*a58d3d2aSXin Li<title>The Vorbis I Specification, Section 4.3.9 Output Channel Order</title> 1842*a58d3d2aSXin Li<author initials="C." surname="Montgomery" 1843*a58d3d2aSXin Li fullname="Christopher &quot;Monty&quot; Montgomery"/> 1844*a58d3d2aSXin Li<date month="January" year="2010"/> 1845*a58d3d2aSXin Li</front> 1846*a58d3d2aSXin Li</reference> 1847*a58d3d2aSXin Li 1848*a58d3d2aSXin Li<reference anchor="vorbis-trim" 1849*a58d3d2aSXin Li target="https://xiph.org/vorbis/doc/Vorbis_I_spec.html#x1-132000A.2"> 1850*a58d3d2aSXin Li <front> 1851*a58d3d2aSXin Li <title>The Vorbis I Specification, Appendix A: Embedding Vorbis 1852*a58d3d2aSXin Li into an Ogg stream</title> 1853*a58d3d2aSXin Li <author initials="C."
surname="Montgomery" 1854*a58d3d2aSXin Li fullname="Christopher &quot;Monty&quot; Montgomery"/> 1855*a58d3d2aSXin Li <date month="November" year="2008"/> 1856*a58d3d2aSXin Li </front> 1857*a58d3d2aSXin Li</reference> 1858*a58d3d2aSXin Li 1859*a58d3d2aSXin Li<reference anchor="wave-multichannel" 1860*a58d3d2aSXin Li target="http://msdn.microsoft.com/en-us/windows/hardware/gg463006.aspx"> 1861*a58d3d2aSXin Li <front> 1862*a58d3d2aSXin Li <title>Multiple Channel Audio Data and WAVE Files</title> 1863*a58d3d2aSXin Li <author> 1864*a58d3d2aSXin Li <organization>Microsoft Corporation</organization> 1865*a58d3d2aSXin Li </author> 1866*a58d3d2aSXin Li <date month="March" year="2007"/> 1867*a58d3d2aSXin Li </front> 1868*a58d3d2aSXin Li</reference> 1869*a58d3d2aSXin Li 1870*a58d3d2aSXin Li</references> 1871*a58d3d2aSXin Li 1872*a58d3d2aSXin Li</back> 1873*a58d3d2aSXin Li</rfc> 1874