xref: /aosp_15_r20/external/libopus/doc/draft-ietf-codec-oggopus.xml (revision a58d3d2adb790c104798cd88c8a3aff4fa8b82cc)
1*a58d3d2aSXin Li<?xml version="1.0" encoding="utf-8"?>
2*a58d3d2aSXin Li<!--
3*a58d3d2aSXin Li   Copyright (c) 2012-2016 Xiph.Org Foundation and contributors
4*a58d3d2aSXin Li
5*a58d3d2aSXin Li   Redistribution and use in source and binary forms, with or without
6*a58d3d2aSXin Li   modification, are permitted provided that the following conditions
7*a58d3d2aSXin Li   are met:
8*a58d3d2aSXin Li
9*a58d3d2aSXin Li   - Redistributions of source code must retain the above copyright
10*a58d3d2aSXin Li   notice, this list of conditions and the following disclaimer.
11*a58d3d2aSXin Li
12*a58d3d2aSXin Li   - Redistributions in binary form must reproduce the above copyright
13*a58d3d2aSXin Li   notice, this list of conditions and the following disclaimer in the
14*a58d3d2aSXin Li   documentation and/or other materials provided with the distribution.
15*a58d3d2aSXin Li
16*a58d3d2aSXin Li   THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS
17*a58d3d2aSXin Li   ``AS IS'' AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT
18*a58d3d2aSXin Li   LIMITED TO, THE IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR
19*a58d3d2aSXin Li   A PARTICULAR PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL THE COPYRIGHT OWNER
20*a58d3d2aSXin Li   OR CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL,
21*a58d3d2aSXin Li   EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO,
22*a58d3d2aSXin Li   PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR
23*a58d3d2aSXin Li   PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF
24*a58d3d2aSXin Li   LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING
25*a58d3d2aSXin Li   NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS
26*a58d3d2aSXin Li   SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE.
27*a58d3d2aSXin Li
28*a58d3d2aSXin Li   Special permission is granted to remove the above copyright notice, list of
29*a58d3d2aSXin Li   conditions, and disclaimer when submitting this document, with or without
30*a58d3d2aSXin Li   modification, to the IETF.
31*a58d3d2aSXin Li-->
32*a58d3d2aSXin Li<!DOCTYPE rfc SYSTEM 'rfc2629.dtd' [
33*a58d3d2aSXin Li<!ENTITY rfc2119 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.2119.xml'>
34*a58d3d2aSXin Li<!ENTITY rfc3533 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.3533.xml'>
35*a58d3d2aSXin Li<!ENTITY rfc3629 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.3629.xml'>
36*a58d3d2aSXin Li<!ENTITY rfc4732 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.4732.xml'>
37*a58d3d2aSXin Li<!ENTITY rfc5226 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.5226.xml'>
38*a58d3d2aSXin Li<!ENTITY rfc5334 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.5334.xml'>
39*a58d3d2aSXin Li<!ENTITY rfc6381 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.6381.xml'>
40*a58d3d2aSXin Li<!ENTITY rfc6716 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.6716.xml'>
41*a58d3d2aSXin Li<!ENTITY rfc6982 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.6982.xml'>
42*a58d3d2aSXin Li<!ENTITY rfc7587 PUBLIC '' 'http://xml.resource.org/public/rfc/bibxml/reference.RFC.7587.xml'>
43*a58d3d2aSXin Li]>
44*a58d3d2aSXin Li<?rfc toc="yes" symrefs="yes" ?>
45*a58d3d2aSXin Li
46*a58d3d2aSXin Li<rfc ipr="trust200902" category="std" docName="draft-ietf-codec-oggopus-14"
47*a58d3d2aSXin Li updates="5334">
48*a58d3d2aSXin Li
49*a58d3d2aSXin Li<front>
50*a58d3d2aSXin Li<title abbrev="Ogg Opus">Ogg Encapsulation for the Opus Audio Codec</title>
51*a58d3d2aSXin Li<author initials="T.B." surname="Terriberry" fullname="Timothy B. Terriberry">
52*a58d3d2aSXin Li<organization>Mozilla Corporation</organization>
53*a58d3d2aSXin Li<address>
54*a58d3d2aSXin Li<postal>
55*a58d3d2aSXin Li<street>650 Castro Street</street>
56*a58d3d2aSXin Li<city>Mountain View</city>
57*a58d3d2aSXin Li<region>CA</region>
58*a58d3d2aSXin Li<code>94041</code>
59*a58d3d2aSXin Li<country>USA</country>
60*a58d3d2aSXin Li</postal>
61*a58d3d2aSXin Li<phone>+1 650 903-0800</phone>
62*a58d3d2aSXin Li<email>tterribe@xiph.org</email>
63*a58d3d2aSXin Li</address>
64*a58d3d2aSXin Li</author>
65*a58d3d2aSXin Li
66*a58d3d2aSXin Li<author initials="R." surname="Lee" fullname="Ron Lee">
67*a58d3d2aSXin Li<organization>Voicetronix</organization>
68*a58d3d2aSXin Li<address>
69*a58d3d2aSXin Li<postal>
70*a58d3d2aSXin Li<street>246 Pulteney Street, Level 1</street>
71*a58d3d2aSXin Li<city>Adelaide</city>
72*a58d3d2aSXin Li<region>SA</region>
73*a58d3d2aSXin Li<code>5000</code>
74*a58d3d2aSXin Li<country>Australia</country>
75*a58d3d2aSXin Li</postal>
76*a58d3d2aSXin Li<phone>+61 8 8232 9112</phone>
77*a58d3d2aSXin Li<email>ron@debian.org</email>
78*a58d3d2aSXin Li</address>
79*a58d3d2aSXin Li</author>
80*a58d3d2aSXin Li
81*a58d3d2aSXin Li<author initials="R." surname="Giles" fullname="Ralph Giles">
82*a58d3d2aSXin Li<organization>Mozilla Corporation</organization>
83*a58d3d2aSXin Li<address>
84*a58d3d2aSXin Li<postal>
85*a58d3d2aSXin Li<street>163 West Hastings Street</street>
86*a58d3d2aSXin Li<city>Vancouver</city>
87*a58d3d2aSXin Li<region>BC</region>
88*a58d3d2aSXin Li<code>V6B 1H5</code>
89*a58d3d2aSXin Li<country>Canada</country>
90*a58d3d2aSXin Li</postal>
91*a58d3d2aSXin Li<phone>+1 778 785 1540</phone>
92*a58d3d2aSXin Li<email>giles@xiph.org</email>
93*a58d3d2aSXin Li</address>
94*a58d3d2aSXin Li</author>
95*a58d3d2aSXin Li
96*a58d3d2aSXin Li<date day="22" month="February" year="2016"/>
97*a58d3d2aSXin Li<area>RAI</area>
98*a58d3d2aSXin Li<workgroup>codec</workgroup>
99*a58d3d2aSXin Li
100*a58d3d2aSXin Li<abstract>
101*a58d3d2aSXin Li<t>
102*a58d3d2aSXin LiThis document defines the Ogg encapsulation for the Opus interactive speech and
103*a58d3d2aSXin Li audio codec.
104*a58d3d2aSXin LiThis allows data encoded in the Opus format to be stored in an Ogg logical
105*a58d3d2aSXin Li bitstream.
106*a58d3d2aSXin Li</t>
107*a58d3d2aSXin Li</abstract>
108*a58d3d2aSXin Li</front>
109*a58d3d2aSXin Li
110*a58d3d2aSXin Li<middle>
111*a58d3d2aSXin Li<section anchor="intro" title="Introduction">
112*a58d3d2aSXin Li<t>
113*a58d3d2aSXin LiThe IETF Opus codec is a low-latency audio codec optimized for both voice and
114*a58d3d2aSXin Li general-purpose audio.
115*a58d3d2aSXin LiSee <xref target="RFC6716"/> for technical details.
116*a58d3d2aSXin LiThis document defines the encapsulation of Opus in a continuous, logical Ogg
117*a58d3d2aSXin Li bitstream&nbsp;<xref target="RFC3533"/>.
118*a58d3d2aSXin LiOgg encapsulation provides Opus with a long-term storage format supporting
119*a58d3d2aSXin Li all of the essential features, including metadata, fast and accurate seeking,
120*a58d3d2aSXin Li corruption detection, recapture after errors, low overhead, and the ability to
121*a58d3d2aSXin Li multiplex Opus with other codecs (including video) with minimal buffering.
122*a58d3d2aSXin LiIt also provides a live streamable format, capable of delivery over a reliable
123*a58d3d2aSXin Li stream-oriented transport, without requiring all the data, or even the total
124*a58d3d2aSXin Li length of the data, up-front, in a form that is identical to the on-disk
125*a58d3d2aSXin Li storage format.
126*a58d3d2aSXin Li</t>
127*a58d3d2aSXin Li<t>
128*a58d3d2aSXin LiOgg bitstreams are made up of a series of 'pages', each of which contains data
129*a58d3d2aSXin Li from one or more 'packets'.
130*a58d3d2aSXin LiPages are the fundamental unit of multiplexing in an Ogg stream.
131*a58d3d2aSXin LiEach page is associated with a particular logical stream and contains a capture
132*a58d3d2aSXin Li pattern and checksum, flags to mark the beginning and end of the logical
133*a58d3d2aSXin Li stream, and a 'granule position' that represents an absolute position in the
134*a58d3d2aSXin Li stream, to aid seeking.
135*a58d3d2aSXin LiA single page can contain up to 65,025 octets of packet data from up to 255
136*a58d3d2aSXin Li different packets.
137*a58d3d2aSXin LiPackets can be split arbitrarily across pages, and continued from one page to
138*a58d3d2aSXin Li the next (allowing packets much larger than would fit on a single page).
139*a58d3d2aSXin LiEach page contains 'lacing values' that indicate how the data is partitioned
140*a58d3d2aSXin Li into packets, allowing a demultiplexer (demuxer) to recover the packet
141*a58d3d2aSXin Li boundaries without examining the encoded data.
142*a58d3d2aSXin LiA packet is said to 'complete' on a page when the page contains the final
143*a58d3d2aSXin Li lacing value corresponding to that packet.
144*a58d3d2aSXin Li</t>
145*a58d3d2aSXin Li<t>
146*a58d3d2aSXin LiThis encapsulation defines the contents of the packet data, including
147*a58d3d2aSXin Li the necessary headers, the organization of those packets into a logical
148*a58d3d2aSXin Li stream, and the interpretation of the codec-specific granule position field.
149*a58d3d2aSXin LiIt does not attempt to describe or specify the existing Ogg container format.
150*a58d3d2aSXin LiReaders unfamiliar with the basic concepts mentioned above are encouraged to
151*a58d3d2aSXin Li review the details in <xref target="RFC3533"/>.
152*a58d3d2aSXin Li</t>
153*a58d3d2aSXin Li
154*a58d3d2aSXin Li</section>
155*a58d3d2aSXin Li
156*a58d3d2aSXin Li<section anchor="terminology" title="Terminology">
157*a58d3d2aSXin Li<t>
158*a58d3d2aSXin LiThe key words "MUST", "MUST NOT", "REQUIRED", "SHALL", "SHALL NOT", "SHOULD",
159*a58d3d2aSXin Li "SHOULD NOT", "RECOMMENDED", "NOT RECOMMENDED", "MAY", and "OPTIONAL" in this
160*a58d3d2aSXin Li document are to be interpreted as described in <xref target="RFC2119"/>.
161*a58d3d2aSXin Li</t>
162*a58d3d2aSXin Li
163*a58d3d2aSXin Li</section>
164*a58d3d2aSXin Li
165*a58d3d2aSXin Li<section anchor="packet_organization" title="Packet Organization">
166*a58d3d2aSXin Li<t>
167*a58d3d2aSXin LiAn Ogg Opus stream is organized as follows (see
168*a58d3d2aSXin Li <xref target="packet-org-example"/> for an example).
169*a58d3d2aSXin Li</t>
170*a58d3d2aSXin Li
171*a58d3d2aSXin Li<figure anchor="packet-org-example"
172*a58d3d2aSXin Li title="Example packet organization for a logical Ogg Opus stream"
173*a58d3d2aSXin Li align="center">
174*a58d3d2aSXin Li<artwork align="center"><![CDATA[
175*a58d3d2aSXin Li    Page 0         Pages 1 ... n        Pages (n+1) ...
176*a58d3d2aSXin Li +------------+ +---+ +---+ ... +---+ +-----------+ +---------+ +--
177*a58d3d2aSXin Li |            | |   | |   |     |   | |           | |         | |
178*a58d3d2aSXin Li |+----------+| |+-----------------+| |+-------------------+ +-----
179*a58d3d2aSXin Li |||ID Header|| ||  Comment Header || ||Audio Data Packet 1| | ...
180*a58d3d2aSXin Li |+----------+| |+-----------------+| |+-------------------+ +-----
181*a58d3d2aSXin Li |            | |   | |   |     |   | |           | |         | |
182*a58d3d2aSXin Li +------------+ +---+ +---+ ... +---+ +-----------+ +---------+ +--
183*a58d3d2aSXin Li ^      ^                           ^
184*a58d3d2aSXin Li |      |                           |
185*a58d3d2aSXin Li |      |                           Mandatory Page Break
186*a58d3d2aSXin Li |      |
187*a58d3d2aSXin Li |      ID header is contained on a single page
188*a58d3d2aSXin Li |
189*a58d3d2aSXin Li 'Beginning Of Stream'
190*a58d3d2aSXin Li]]></artwork>
191*a58d3d2aSXin Li</figure>
192*a58d3d2aSXin Li
193*a58d3d2aSXin Li<t>
194*a58d3d2aSXin LiThere are two mandatory header packets.
195*a58d3d2aSXin LiThe first packet in the logical Ogg bitstream MUST contain the identification
196*a58d3d2aSXin Li (ID) header, which uniquely identifies a stream as Opus audio.
197*a58d3d2aSXin LiThe format of this header is defined in <xref target="id_header"/>.
198*a58d3d2aSXin LiIt is placed alone (without any other packet data) on the first page of
199*a58d3d2aSXin Li the logical Ogg bitstream, and completes on that page.
200*a58d3d2aSXin LiThis page has its 'beginning of stream' flag set.
201*a58d3d2aSXin Li</t>
202*a58d3d2aSXin Li<t>
203*a58d3d2aSXin LiThe second packet in the logical Ogg bitstream MUST contain the comment header,
204*a58d3d2aSXin Li which contains user-supplied metadata.
205*a58d3d2aSXin LiThe format of this header is defined in <xref target="comment_header"/>.
206*a58d3d2aSXin LiIt MAY span multiple pages, beginning on the second page of the logical
207*a58d3d2aSXin Li stream.
208*a58d3d2aSXin LiHowever many pages it spans, the comment header packet MUST finish the page on
209*a58d3d2aSXin Li which it completes.
210*a58d3d2aSXin Li</t>
211*a58d3d2aSXin Li<t>
212*a58d3d2aSXin LiAll subsequent pages are audio data pages, and the Ogg packets they contain are
213*a58d3d2aSXin Li audio data packets.
214*a58d3d2aSXin LiEach audio data packet contains one Opus packet for each of N different
215*a58d3d2aSXin Li streams, where N is typically one for mono or stereo, but MAY be greater than
216*a58d3d2aSXin Li one for multichannel audio.
217*a58d3d2aSXin LiThe value N is specified in the ID header (see
218*a58d3d2aSXin Li <xref target="channel_mapping"/>), and is fixed over the entire length of the
219*a58d3d2aSXin Li logical Ogg bitstream.
220*a58d3d2aSXin Li</t>
221*a58d3d2aSXin Li<t>
222*a58d3d2aSXin LiThe first (N&nbsp;-&nbsp;1) Opus packets, if any, are packed one after another
223*a58d3d2aSXin Li into the Ogg packet, using the self-delimiting framing from Appendix&nbsp;B of
224*a58d3d2aSXin Li <xref target="RFC6716"/>.
225*a58d3d2aSXin LiThe remaining Opus packet is packed at the end of the Ogg packet using the
226*a58d3d2aSXin Li regular, undelimited framing from Section&nbsp;3 of <xref target="RFC6716"/>.
227*a58d3d2aSXin LiAll of the Opus packets in a single Ogg packet MUST be constrained to have the
228*a58d3d2aSXin Li same duration.
229*a58d3d2aSXin LiAn implementation of this specification SHOULD treat any Opus packet whose
230*a58d3d2aSXin Li duration is different from that of the first Opus packet in an Ogg packet as
231*a58d3d2aSXin Li if it were a malformed Opus packet with an invalid Table Of Contents (TOC)
232*a58d3d2aSXin Li sequence.
233*a58d3d2aSXin Li</t>
234*a58d3d2aSXin Li<t>
235*a58d3d2aSXin LiThe TOC sequence at the beginning of each Opus packet indicates the coding
236*a58d3d2aSXin Li mode, audio bandwidth, channel count, duration (frame size), and number of
237*a58d3d2aSXin Li frames per packet, as described in Section&nbsp;3.1
238*a58d3d2aSXin Li of&nbsp;<xref target="RFC6716"/>.
239*a58d3d2aSXin LiThe coding mode is one of SILK, Hybrid, or Constrained Energy Lapped Transform
240*a58d3d2aSXin Li (CELT).
241*a58d3d2aSXin LiThe combination of coding mode, audio bandwidth, and frame size is referred to
242*a58d3d2aSXin Li as the configuration of an Opus packet.
243*a58d3d2aSXin Li</t>
244*a58d3d2aSXin Li<t>
245*a58d3d2aSXin LiPackets are placed into Ogg pages in order until the end of stream.
246*a58d3d2aSXin LiAudio data packets might span page boundaries.
247*a58d3d2aSXin LiThe first audio data page could have the 'continued packet' flag set
248*a58d3d2aSXin Li (indicating the first audio data packet is continued from a previous page) if,
249*a58d3d2aSXin Li for example, it was a live stream joined mid-broadcast, with the headers
250*a58d3d2aSXin Li pasted on the front.
251*a58d3d2aSXin LiIf a page has the 'continued packet' flag set and one of the following
252*a58d3d2aSXin Li conditions is also true:
253*a58d3d2aSXin Li<list style="symbols">
254*a58d3d2aSXin Li<t>the previous page with packet data does not end in a continued packet (does
255*a58d3d2aSXin Li not end with a lacing value of 255) OR</t>
256*a58d3d2aSXin Li<t>the page sequence numbers are not consecutive,</t>
257*a58d3d2aSXin Li</list>
258*a58d3d2aSXin Li then a demuxer MUST NOT attempt to decode the data for the first packet on the
259*a58d3d2aSXin Li page unless the demuxer has some special knowledge that would allow it to
260*a58d3d2aSXin Li interpret this data despite the missing pieces.
261*a58d3d2aSXin LiAn implementation MUST treat a zero-octet audio data packet as if it were a
262*a58d3d2aSXin Li malformed Opus packet as described in
263*a58d3d2aSXin Li Section&nbsp;3.4 of&nbsp;<xref target="RFC6716"/>.
264*a58d3d2aSXin Li</t>
265*a58d3d2aSXin Li<t>
266*a58d3d2aSXin LiA logical stream ends with a page with the 'end of stream' flag set, but
267*a58d3d2aSXin Li implementations need to be prepared to deal with truncated streams that do not
268*a58d3d2aSXin Li have a page marked 'end of stream'.
269*a58d3d2aSXin LiThere is no reason for the final packet on the last page to be a continued
270*a58d3d2aSXin Li packet, i.e., for the final lacing value to be 255.
271*a58d3d2aSXin LiHowever, demuxers might encounter such streams, possibly as the result of a
272*a58d3d2aSXin Li transfer that did not complete or of corruption.
273*a58d3d2aSXin LiIf a packet continues onto a subsequent page (i.e., when the page ends with a
274*a58d3d2aSXin Li lacing value of 255) and one of the following conditions is also true:
275*a58d3d2aSXin Li<list style="symbols">
276*a58d3d2aSXin Li<t>the next page with packet data does not have the 'continued packet' flag
277*a58d3d2aSXin Li set OR</t>
278*a58d3d2aSXin Li<t>there is no next page with packet data OR</t>
279*a58d3d2aSXin Li<t>the page sequence numbers are not consecutive,</t>
280*a58d3d2aSXin Li</list>
281*a58d3d2aSXin Li then a demuxer MUST NOT attempt to decode the data from that packet unless the
282*a58d3d2aSXin Li demuxer has some special knowledge that would allow it to interpret this data
283*a58d3d2aSXin Li despite the missing pieces.
284*a58d3d2aSXin LiThere MUST NOT be any more pages in an Opus logical bitstream after a page
285*a58d3d2aSXin Li marked 'end of stream'.
286*a58d3d2aSXin Li</t>
287*a58d3d2aSXin Li</section>
288*a58d3d2aSXin Li
289*a58d3d2aSXin Li<section anchor="granpos" title="Granule Position">
290*a58d3d2aSXin Li<t>
291*a58d3d2aSXin LiThe granule position MUST be zero for the ID header page and the
292*a58d3d2aSXin Li page where the comment header completes.
293*a58d3d2aSXin LiThat is, the first page in the logical stream and the last header
294*a58d3d2aSXin Li page before the first audio data page both have a granule position of zero.
295*a58d3d2aSXin Li</t>
296*a58d3d2aSXin Li<t>
297*a58d3d2aSXin LiThe granule position of an audio data page encodes the total number of PCM
298*a58d3d2aSXin Li samples in the stream up to and including the last fully-decodable sample from
299*a58d3d2aSXin Li the last packet completed on that page.
300*a58d3d2aSXin LiThe granule position of the first audio data page will usually be larger than
301*a58d3d2aSXin Li zero, as described in <xref target="start_granpos_restrictions"/>.
302*a58d3d2aSXin Li</t>
303*a58d3d2aSXin Li
304*a58d3d2aSXin Li<t>
305*a58d3d2aSXin LiA page that is entirely spanned by a single packet (that completes on a
306*a58d3d2aSXin Li subsequent page) has no granule position, and the granule position field is
307*a58d3d2aSXin Li set to the special value '-1' in two's complement.
308*a58d3d2aSXin Li</t>
309*a58d3d2aSXin Li
310*a58d3d2aSXin Li<t>
311*a58d3d2aSXin LiThe granule position of an audio data page is in units of PCM audio samples at
312*a58d3d2aSXin Li a fixed rate of 48&nbsp;kHz (per channel; a stereo stream's granule position
313*a58d3d2aSXin Li does not increment at twice the speed of a mono stream).
314*a58d3d2aSXin LiIt is possible to run an Opus decoder at other sampling rates,
315*a58d3d2aSXin Li but all Opus packets encode samples at a sampling rate that evenly divides
316*a58d3d2aSXin Li 48&nbsp;kHz.
317*a58d3d2aSXin LiTherefore, the value in the granule position field always counts samples
318*a58d3d2aSXin Li assuming a 48&nbsp;kHz decoding rate, and the rest of this specification makes
319*a58d3d2aSXin Li the same assumption.
320*a58d3d2aSXin Li</t>
321*a58d3d2aSXin Li
322*a58d3d2aSXin Li<t>
323*a58d3d2aSXin LiThe duration of an Opus packet as defined in <xref target="RFC6716"/> can be
324*a58d3d2aSXin Li any multiple of 2.5&nbsp;ms, up to a maximum of 120&nbsp;ms.
325*a58d3d2aSXin LiThis duration is encoded in the TOC sequence at the beginning of each packet.
326*a58d3d2aSXin LiThe number of samples returned by a decoder corresponds to this duration
327*a58d3d2aSXin Li exactly, even for the first few packets.
328*a58d3d2aSXin LiFor example, a 20&nbsp;ms packet fed to a decoder running at 48&nbsp;kHz will
329*a58d3d2aSXin Li always return 960&nbsp;samples.
330*a58d3d2aSXin LiA demuxer can parse the TOC sequence at the beginning of each Ogg packet to
331*a58d3d2aSXin Li work backwards or forwards from a packet with a known granule position (i.e.,
332*a58d3d2aSXin Li the last packet completed on some page) in order to assign granule positions
333*a58d3d2aSXin Li to every packet, or even every individual sample.
334*a58d3d2aSXin LiThe one exception is the last page in the stream, as described below.
335*a58d3d2aSXin Li</t>
336*a58d3d2aSXin Li
337*a58d3d2aSXin Li<t>
338*a58d3d2aSXin LiAll other pages with completed packets after the first MUST have a granule
339*a58d3d2aSXin Li position equal to the number of samples contained in packets that complete on
340*a58d3d2aSXin Li that page plus the granule position of the most recent page with completed
341*a58d3d2aSXin Li packets.
342*a58d3d2aSXin LiThis guarantees that a demuxer can assign individual packets the same granule
343*a58d3d2aSXin Li position when working forwards as when working backwards.
344*a58d3d2aSXin LiFor this to work, there cannot be any gaps.
345*a58d3d2aSXin Li</t>
346*a58d3d2aSXin Li
347*a58d3d2aSXin Li<section anchor="gap-repair" title="Repairing Gaps in Real-time Streams">
348*a58d3d2aSXin Li<t>
349*a58d3d2aSXin LiIn order to support capturing a real-time stream that has lost or not
350*a58d3d2aSXin Li transmitted packets, a multiplexer (muxer) SHOULD emit packets that explicitly
351*a58d3d2aSXin Li request the use of Packet Loss Concealment (PLC) in place of the missing
352*a58d3d2aSXin Li packets.
353*a58d3d2aSXin LiImplementations that fail to do so still MUST NOT increment the granule
354*a58d3d2aSXin Li position for a page by anything other than the number of samples contained in
355*a58d3d2aSXin Li packets that actually complete on that page.
356*a58d3d2aSXin Li</t>
357*a58d3d2aSXin Li<t>
358*a58d3d2aSXin LiOnly gaps that are a multiple of 2.5&nbsp;ms are repairable, as these are the
359*a58d3d2aSXin Li only durations that can be created by packet loss or discontinuous
360*a58d3d2aSXin Li transmission.
361*a58d3d2aSXin LiMuxers need not handle other gap sizes.
362*a58d3d2aSXin LiCreating the necessary packets involves synthesizing a TOC byte (defined in
363*a58d3d2aSXin Li Section&nbsp;3.1 of&nbsp;<xref target="RFC6716"/>)&mdash;and whatever
364*a58d3d2aSXin Li additional internal framing is needed&mdash;to indicate the packet duration
365*a58d3d2aSXin Li for each stream.
366*a58d3d2aSXin LiThe actual length of each missing Opus frame inside the packet is zero bytes,
367*a58d3d2aSXin Li as defined in Section&nbsp;3.2.1 of&nbsp;<xref target="RFC6716"/>.
368*a58d3d2aSXin Li</t>
369*a58d3d2aSXin Li
370*a58d3d2aSXin Li<t>
371*a58d3d2aSXin LiZero-byte frames MAY be packed into packets using any of codes&nbsp;0, 1,
372*a58d3d2aSXin Li 2, or&nbsp;3.
373*a58d3d2aSXin LiWhen successive frames have the same configuration, the higher code packings
374*a58d3d2aSXin Li reduce overhead.
375*a58d3d2aSXin LiLikewise, if the TOC configuration matches, the muxer MAY further combine the
376*a58d3d2aSXin Li empty frames with previous or subsequent non-zero-length frames (using
377*a58d3d2aSXin Li code&nbsp;2 or VBR code&nbsp;3).
378*a58d3d2aSXin Li</t>
379*a58d3d2aSXin Li
380*a58d3d2aSXin Li<t>
381*a58d3d2aSXin Li<xref target="RFC6716"/> does not impose any requirements on the PLC, but this
382*a58d3d2aSXin Li section outlines choices that are expected to have a positive influence on
383*a58d3d2aSXin Li most PLC implementations, including the reference implementation.
384*a58d3d2aSXin LiSynthesized TOC sequences SHOULD maintain the same mode, audio bandwidth,
385*a58d3d2aSXin Li channel count, and frame size as the previous packet (if any).
386*a58d3d2aSXin LiThis is the simplest and usually the most well-tested case for the PLC to
387*a58d3d2aSXin Li handle, and it covers all losses that do not include a configuration switch,
388*a58d3d2aSXin Li as defined in Section&nbsp;4.5 of&nbsp;<xref target="RFC6716"/>.
389*a58d3d2aSXin Li</t>
390*a58d3d2aSXin Li
391*a58d3d2aSXin Li<t>
392*a58d3d2aSXin LiWhen a previous packet is available, keeping the audio bandwidth and channel
393*a58d3d2aSXin Li count the same allows the PLC to provide maximum continuity in the concealment
394*a58d3d2aSXin Li data it generates.
395*a58d3d2aSXin LiHowever, if the size of the gap is not a multiple of the most recent frame
396*a58d3d2aSXin Li size, then the frame size will have to change for at least some frames.
397*a58d3d2aSXin LiSuch changes SHOULD be delayed as long as possible to simplify
398*a58d3d2aSXin Li things for PLC implementations.
399*a58d3d2aSXin Li</t>
400*a58d3d2aSXin Li
401*a58d3d2aSXin Li<t>
402*a58d3d2aSXin LiAs an example, a 95&nbsp;ms gap could be encoded as nineteen 5&nbsp;ms frames
403*a58d3d2aSXin Li in two bytes with a single CBR code&nbsp;3 packet.
404*a58d3d2aSXin LiIf the previous frame size was 20&nbsp;ms, using four 20&nbsp;ms frames
405*a58d3d2aSXin Li followed by three 5&nbsp;ms frames requires 4&nbsp;bytes (plus an extra byte
406*a58d3d2aSXin Li of Ogg lacing overhead), but allows the PLC to use its well-tested
407*a58d3d2aSXin Li steady-state behavior for as long as possible.
408*a58d3d2aSXin LiThe total bitrate of the latter approach, including Ogg overhead, is about
409*a58d3d2aSXin Li 0.4&nbsp;kbps, so the impact on file size is minimal.
410*a58d3d2aSXin Li</t>
411*a58d3d2aSXin Li
412*a58d3d2aSXin Li<t>
413*a58d3d2aSXin LiChanging modes is discouraged, since this causes some decoder implementations
414*a58d3d2aSXin Li to reset their PLC state.
415*a58d3d2aSXin LiHowever, SILK and Hybrid mode frames cannot fill gaps that are not a multiple
416*a58d3d2aSXin Li of 10&nbsp;ms.
417*a58d3d2aSXin LiIf switching to CELT mode is needed to match the gap size, a muxer SHOULD do
418*a58d3d2aSXin Li so at the end of the gap to allow the PLC to function for as long as possible.
419*a58d3d2aSXin Li</t>
420*a58d3d2aSXin Li
421*a58d3d2aSXin Li<t>
422*a58d3d2aSXin LiIn the example above, if the previous frame was a 20&nbsp;ms SILK mode frame,
423*a58d3d2aSXin Li the better solution is to synthesize a packet describing four 20&nbsp;ms SILK
424*a58d3d2aSXin Li frames, followed by a packet with a single 10&nbsp;ms SILK
425*a58d3d2aSXin Li frame, and finally a packet with a 5&nbsp;ms CELT frame, to fill the 95&nbsp;ms
426*a58d3d2aSXin Li gap.
427*a58d3d2aSXin LiThis also requires four bytes to describe the synthesized packet data (two
428*a58d3d2aSXin Li bytes for a CBR code&nbsp;3 and one byte each for two code&nbsp;0 packets) but three
429*a58d3d2aSXin Li bytes of Ogg lacing overhead are needed to mark the packet boundaries.
430*a58d3d2aSXin LiAt 0.6&nbsp;kbps, this is still a minimal bitrate impact over a naive, low-quality
431*a58d3d2aSXin Li solution.
432*a58d3d2aSXin Li</t>
433*a58d3d2aSXin Li
434*a58d3d2aSXin Li<t>
435*a58d3d2aSXin LiSince medium-band audio is an option only in the SILK mode, wideband frames
436*a58d3d2aSXin Li SHOULD be generated if switching from that configuration to CELT mode, to
437*a58d3d2aSXin Li ensure that any PLC implementation that does try to migrate state between
438*a58d3d2aSXin Li the modes will be able to preserve all of the available audio bandwidth.
439*a58d3d2aSXin Li</t>
440*a58d3d2aSXin Li
441*a58d3d2aSXin Li</section>
442*a58d3d2aSXin Li
443*a58d3d2aSXin Li<section anchor="preskip" title="Pre-skip">
444*a58d3d2aSXin Li<t>
445*a58d3d2aSXin LiThere is some amount of latency introduced during the decoding process, to
446*a58d3d2aSXin Li allow for overlap in the CELT mode, stereo mixing in the SILK mode, and
447*a58d3d2aSXin Li resampling.
448*a58d3d2aSXin LiThe encoder might have introduced additional latency through its own resampling
449*a58d3d2aSXin Li and analysis (though the exact amount is not specified).
450*a58d3d2aSXin LiTherefore, the first few samples produced by the decoder do not correspond to
451*a58d3d2aSXin Li real input audio, but are instead composed of padding inserted by the encoder
452*a58d3d2aSXin Li to compensate for this latency.
453*a58d3d2aSXin LiThese samples need to be stored and decoded, as Opus is an asymptotically
454*a58d3d2aSXin Li convergent predictive codec, meaning the decoded contents of each frame depend
455*a58d3d2aSXin Li on the recent history of decoder inputs.
456*a58d3d2aSXin LiHowever, a player will want to skip these samples after decoding them.
457*a58d3d2aSXin Li</t>
458*a58d3d2aSXin Li
459*a58d3d2aSXin Li<t>
460*a58d3d2aSXin LiA 'pre-skip' field in the ID header (see <xref target="id_header"/>) signals
461*a58d3d2aSXin Li the number of samples that SHOULD be skipped (decoded but discarded) at the
462*a58d3d2aSXin Li beginning of the stream, though some specific applications might have a reason
463*a58d3d2aSXin Li for looking at that data.
464*a58d3d2aSXin LiThis amount need not be a multiple of 2.5&nbsp;ms, MAY be smaller than a single
465*a58d3d2aSXin Li packet, or MAY span the contents of several packets.
466*a58d3d2aSXin LiThese samples are not valid audio.
467*a58d3d2aSXin Li</t>
468*a58d3d2aSXin Li
469*a58d3d2aSXin Li<t>
470*a58d3d2aSXin LiFor example, if the first Opus frame uses the CELT mode, it will always
471*a58d3d2aSXin Li produce 120 samples of windowed overlap-add data.
472*a58d3d2aSXin LiHowever, the overlap data is initially all zeros (since there is no prior
473*a58d3d2aSXin Li frame), meaning this cannot, in general, accurately represent the original
474*a58d3d2aSXin Li audio.
475*a58d3d2aSXin LiThe SILK mode requires additional delay to account for its analysis and
476*a58d3d2aSXin Li resampling latency.
477*a58d3d2aSXin LiThe encoder delays the original audio to avoid this problem.
478*a58d3d2aSXin Li</t>
479*a58d3d2aSXin Li
480*a58d3d2aSXin Li<t>
481*a58d3d2aSXin LiThe pre-skip field MAY also be used to perform sample-accurate cropping of
482*a58d3d2aSXin Li already encoded streams.
483*a58d3d2aSXin LiIn this case, a value of at least 3840&nbsp;samples (80&nbsp;ms) provides
484*a58d3d2aSXin Li sufficient history to the decoder that it will have converged
485*a58d3d2aSXin Li before the stream's output begins.
486*a58d3d2aSXin Li</t>
487*a58d3d2aSXin Li
488*a58d3d2aSXin Li</section>
489*a58d3d2aSXin Li
490*a58d3d2aSXin Li<section anchor="pcm_sample_position" title="PCM Sample Position">
491*a58d3d2aSXin Li<t>
492*a58d3d2aSXin LiThe PCM sample position is determined from the granule position using the
493*a58d3d2aSXin Li formula
494*a58d3d2aSXin Li</t>
495*a58d3d2aSXin Li<figure align="center">
496*a58d3d2aSXin Li<artwork align="center"><![CDATA[
497*a58d3d2aSXin Li'PCM sample position' = 'granule position' - 'pre-skip' .
498*a58d3d2aSXin Li]]></artwork>
499*a58d3d2aSXin Li</figure>
500*a58d3d2aSXin Li
501*a58d3d2aSXin Li<t>
502*a58d3d2aSXin LiFor example, if the granule position of the first audio data page is 59,971,
503*a58d3d2aSXin Li and the pre-skip is 11,971, then the PCM sample position of the last decoded
504*a58d3d2aSXin Li sample from that page is 48,000.
505*a58d3d2aSXin Li</t>
506*a58d3d2aSXin Li<t>
507*a58d3d2aSXin LiThis can be converted into a playback time using the formula
508*a58d3d2aSXin Li</t>
509*a58d3d2aSXin Li<figure align="center">
510*a58d3d2aSXin Li<artwork align="center"><![CDATA[
511*a58d3d2aSXin Li                  'PCM sample position'
512*a58d3d2aSXin Li'playback time' = --------------------- .
513*a58d3d2aSXin Li                         48000.0
514*a58d3d2aSXin Li]]></artwork>
515*a58d3d2aSXin Li</figure>
516*a58d3d2aSXin Li
517*a58d3d2aSXin Li<t>
518*a58d3d2aSXin LiThe initial PCM sample position before any samples are played is normally '0'.
519*a58d3d2aSXin LiIn this case, the PCM sample position of the first audio sample to be played
520*a58d3d2aSXin Li starts at '1', because it marks the time on the clock
521*a58d3d2aSXin Li <spanx style="emph">after</spanx> that sample has been played, and a stream
522*a58d3d2aSXin Li that is exactly one second long has a final PCM sample position of '48000',
523*a58d3d2aSXin Li as in the example here.
524*a58d3d2aSXin Li</t>
525*a58d3d2aSXin Li
526*a58d3d2aSXin Li<t>
527*a58d3d2aSXin LiVorbis streams use a granule position smaller than the number of audio samples
528*a58d3d2aSXin Li contained in the first audio data page to indicate that some of those samples
529*a58d3d2aSXin Li are trimmed from the output (see <xref target="vorbis-trim"/>).
530*a58d3d2aSXin LiHowever, to do so, Vorbis requires that the first audio data page contains
531*a58d3d2aSXin Li exactly two packets, in order to allow the decoder to perform PCM position
532*a58d3d2aSXin Li adjustments before needing to return any PCM data.
533*a58d3d2aSXin LiOpus uses the pre-skip mechanism for this purpose instead, since the encoder
534*a58d3d2aSXin Li might introduce more than a single packet's worth of latency, and since very
535*a58d3d2aSXin Li large packets in streams with a very large number of channels might not fit
536*a58d3d2aSXin Li on a single page.
537*a58d3d2aSXin Li</t>
538*a58d3d2aSXin Li</section>
539*a58d3d2aSXin Li
540*a58d3d2aSXin Li<section anchor="end_trimming" title="End Trimming">
541*a58d3d2aSXin Li<t>
542*a58d3d2aSXin LiThe page with the 'end of stream' flag set MAY have a granule position that
543*a58d3d2aSXin Li indicates the page contains less audio data than would normally be returned by
544*a58d3d2aSXin Li decoding up through the final packet.
545*a58d3d2aSXin LiThis is used to end the stream somewhere other than an even frame boundary.
546*a58d3d2aSXin LiThe granule position of the most recent audio data page with completed packets
547*a58d3d2aSXin Li is used to make this determination, or '0' is used if there were no previous
548*a58d3d2aSXin Li audio data pages with a completed packet.
549*a58d3d2aSXin LiThe difference between these granule positions indicates how many samples to
550*a58d3d2aSXin Li keep after decoding the packets that completed on the final page.
551*a58d3d2aSXin LiThe remaining samples are discarded.
552*a58d3d2aSXin LiThe number of discarded samples SHOULD be no larger than the number decoded
553*a58d3d2aSXin Li from the last packet.
554*a58d3d2aSXin Li</t>
555*a58d3d2aSXin Li</section>
556*a58d3d2aSXin Li
557*a58d3d2aSXin Li<section anchor="start_granpos_restrictions"
558*a58d3d2aSXin Li title="Restrictions on the Initial Granule Position">
559*a58d3d2aSXin Li<t>
560*a58d3d2aSXin LiThe granule position of the first audio data page with a completed packet MAY
561*a58d3d2aSXin Li be larger than the number of samples contained in packets that complete on
562*a58d3d2aSXin Li that page; however, it MUST NOT be smaller, unless that page has the 'end of
563*a58d3d2aSXin Li stream' flag set.
564*a58d3d2aSXin LiAllowing a granule position larger than the number of samples allows the
565*a58d3d2aSXin Li beginning of a stream to be cropped or a live stream to be joined without
566*a58d3d2aSXin Li rewriting the granule position of all the remaining pages.
567*a58d3d2aSXin LiThis means that the PCM sample position just before the first sample to be
568*a58d3d2aSXin Li played MAY be larger than '0'.
569*a58d3d2aSXin LiSynchronization when multiplexing with other logical streams still uses the PCM
570*a58d3d2aSXin Li sample position relative to '0' to compute sample times.
571*a58d3d2aSXin LiThis does not affect the behavior of pre-skip: exactly 'pre-skip' samples
572*a58d3d2aSXin Li SHOULD be skipped from the beginning of the decoded output, even if the
573*a58d3d2aSXin Li initial PCM sample position is greater than zero.
574*a58d3d2aSXin Li</t>
575*a58d3d2aSXin Li
576*a58d3d2aSXin Li<t>
577*a58d3d2aSXin LiOn the other hand, a granule position that is smaller than the number of
578*a58d3d2aSXin Li decoded samples prevents a demuxer from working backwards to assign each
579*a58d3d2aSXin Li packet or each individual sample a valid granule position, since granule
580*a58d3d2aSXin Li positions are non-negative.
581*a58d3d2aSXin LiAn implementation MUST treat any stream as invalid if the granule position
582*a58d3d2aSXin Li is smaller than the number of samples contained in packets that complete on
583*a58d3d2aSXin Li the first audio data page with a completed packet, unless that page has the
584*a58d3d2aSXin Li 'end of stream' flag set.
585*a58d3d2aSXin LiIt MAY defer this action until it decodes the last packet completed on that
586*a58d3d2aSXin Li page.
587*a58d3d2aSXin Li</t>
588*a58d3d2aSXin Li
589*a58d3d2aSXin Li<t>
590*a58d3d2aSXin LiIf that page has the 'end of stream' flag set, a demuxer MUST treat any stream
591*a58d3d2aSXin Li as invalid if its granule position is smaller than the 'pre-skip' amount.
592*a58d3d2aSXin LiThis would indicate that there are more samples to be skipped from the initial
593*a58d3d2aSXin Li decoded output than exist in the stream.
594*a58d3d2aSXin LiIf the granule position is smaller than the number of decoded samples produced
595*a58d3d2aSXin Li by the packets that complete on that page, then a demuxer MUST use an initial
596*a58d3d2aSXin Li granule position of '0', and can work forwards from '0' to timestamp
597*a58d3d2aSXin Li individual packets.
598*a58d3d2aSXin LiIf the granule position is larger than the number of decoded samples available,
599*a58d3d2aSXin Li then the demuxer MUST still work backwards as described above, even if the
600*a58d3d2aSXin Li 'end of stream' flag is set, to determine the initial granule position, and
601*a58d3d2aSXin Li thus the initial PCM sample position.
602*a58d3d2aSXin LiBoth of these will be greater than '0' in this case.
603*a58d3d2aSXin Li</t>
604*a58d3d2aSXin Li</section>
605*a58d3d2aSXin Li
606*a58d3d2aSXin Li<section anchor="seeking_and_preroll" title="Seeking and Pre-roll">
607*a58d3d2aSXin Li<t>
608*a58d3d2aSXin LiSeeking in Ogg files is best performed using a bisection search for a page
609*a58d3d2aSXin Li whose granule position corresponds to a PCM position at or before the seek
610*a58d3d2aSXin Li target.
611*a58d3d2aSXin LiWith appropriately weighted bisection, accurate seeking can be performed in
612*a58d3d2aSXin Li just one or two bisections on average, even in multi-gigabyte files.
613*a58d3d2aSXin LiSee <xref target="seeking"/> for an example of general implementation guidance.
614*a58d3d2aSXin Li</t>
615*a58d3d2aSXin Li
616*a58d3d2aSXin Li<t>
617*a58d3d2aSXin LiWhen seeking within an Ogg Opus stream, an implementation SHOULD start decoding
618*a58d3d2aSXin Li (and discarding the output) at least 3,840&nbsp;samples (80&nbsp;ms) prior to
619*a58d3d2aSXin Li the seek target in order to ensure that the output audio is correct by the
620*a58d3d2aSXin Li time it reaches the seek target.
621*a58d3d2aSXin LiThis 'pre-roll' is separate from, and unrelated to, the 'pre-skip' used at the
622*a58d3d2aSXin Li beginning of the stream.
623*a58d3d2aSXin LiIf the point 80&nbsp;ms prior to the seek target comes before the initial PCM
624*a58d3d2aSXin Li sample position, an implementation SHOULD start decoding from the beginning of
625*a58d3d2aSXin Li the stream, applying pre-skip as normal, regardless of whether the pre-skip is
626*a58d3d2aSXin Li larger or smaller than 80&nbsp;ms, and then continue to discard samples
627*a58d3d2aSXin Li to reach the seek target (if any).
628*a58d3d2aSXin Li</t>
629*a58d3d2aSXin Li</section>
630*a58d3d2aSXin Li
631*a58d3d2aSXin Li</section>
632*a58d3d2aSXin Li
633*a58d3d2aSXin Li<section anchor="headers" title="Header Packets">
634*a58d3d2aSXin Li<t>
635*a58d3d2aSXin LiAn Ogg Opus logical stream contains exactly two mandatory header packets:
636*a58d3d2aSXin Li an identification header and a comment header.
637*a58d3d2aSXin Li</t>
638*a58d3d2aSXin Li
639*a58d3d2aSXin Li<section anchor="id_header" title="Identification Header">
640*a58d3d2aSXin Li
641*a58d3d2aSXin Li<figure anchor="id_header_packet" title="ID Header Packet" align="center">
642*a58d3d2aSXin Li<artwork align="center"><![CDATA[
643*a58d3d2aSXin Li 0                   1                   2                   3
644*a58d3d2aSXin Li 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
645*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
646*a58d3d2aSXin Li|      'O'      |      'p'      |      'u'      |      's'      |
647*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
648*a58d3d2aSXin Li|      'H'      |      'e'      |      'a'      |      'd'      |
649*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
650*a58d3d2aSXin Li|  Version = 1  | Channel Count |           Pre-skip            |
651*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
652*a58d3d2aSXin Li|                     Input Sample Rate (Hz)                    |
653*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
654*a58d3d2aSXin Li|   Output Gain (Q7.8 in dB)    | Mapping Family|               |
655*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+               :
656*a58d3d2aSXin Li|                                                               |
657*a58d3d2aSXin Li:               Optional Channel Mapping Table...               :
658*a58d3d2aSXin Li|                                                               |
659*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
660*a58d3d2aSXin Li]]></artwork>
661*a58d3d2aSXin Li</figure>
662*a58d3d2aSXin Li
663*a58d3d2aSXin Li<t>
664*a58d3d2aSXin LiThe fields in the identification (ID) header have the following meaning:
665*a58d3d2aSXin Li<list style="numbers">
666*a58d3d2aSXin Li<t>Magic Signature:
667*a58d3d2aSXin Li<vspace blankLines="1"/>
668*a58d3d2aSXin LiThis is an 8-octet (64-bit) field that allows codec identification and is
669*a58d3d2aSXin Li human-readable.
670*a58d3d2aSXin LiIt contains, in order, the magic numbers:
671*a58d3d2aSXin Li<list style="empty">
672*a58d3d2aSXin Li<t>0x4F 'O'</t>
673*a58d3d2aSXin Li<t>0x70 'p'</t>
674*a58d3d2aSXin Li<t>0x75 'u'</t>
675*a58d3d2aSXin Li<t>0x73 's'</t>
676*a58d3d2aSXin Li<t>0x48 'H'</t>
677*a58d3d2aSXin Li<t>0x65 'e'</t>
678*a58d3d2aSXin Li<t>0x61 'a'</t>
679*a58d3d2aSXin Li<t>0x64 'd'</t>
680*a58d3d2aSXin Li</list>
681*a58d3d2aSXin LiStarting with "Op" helps distinguish it from audio data packets, as this is an
682*a58d3d2aSXin Li invalid TOC sequence.
683*a58d3d2aSXin Li<vspace blankLines="1"/>
684*a58d3d2aSXin Li</t>
685*a58d3d2aSXin Li<t>Version (8 bits, unsigned):
686*a58d3d2aSXin Li<vspace blankLines="1"/>
687*a58d3d2aSXin LiThe version number MUST always be '1' for this version of the encapsulation
688*a58d3d2aSXin Li specification.
689*a58d3d2aSXin LiImplementations SHOULD treat streams where the upper four bits of the version
690*a58d3d2aSXin Li number match that of a recognized specification as backwards-compatible with
691*a58d3d2aSXin Li that specification.
692*a58d3d2aSXin LiThat is, the version number can be split into "major" and "minor" version
693*a58d3d2aSXin Li sub-fields, with changes to the "minor" sub-field (in the lower four bits)
694*a58d3d2aSXin Li signaling compatible changes.
695*a58d3d2aSXin LiFor example, an implementation of this specification SHOULD accept any stream
696*a58d3d2aSXin Li with a version number of '15' or less, and SHOULD assume any stream with a
697*a58d3d2aSXin Li version number '16' or greater is incompatible.
698*a58d3d2aSXin LiThe initial version '1' was chosen to keep implementations from relying on this
699*a58d3d2aSXin Li octet as a null terminator for the "OpusHead" string.
700*a58d3d2aSXin Li<vspace blankLines="1"/>
701*a58d3d2aSXin Li</t>
702*a58d3d2aSXin Li<t>Output Channel Count 'C' (8 bits, unsigned):
703*a58d3d2aSXin Li<vspace blankLines="1"/>
704*a58d3d2aSXin LiThis is the number of output channels.
705*a58d3d2aSXin LiThis might be different than the number of encoded channels, which can change
706*a58d3d2aSXin Li on a packet-by-packet basis.
707*a58d3d2aSXin LiThis value MUST NOT be zero.
708*a58d3d2aSXin LiThe maximum allowable value depends on the channel mapping family, and might be
709*a58d3d2aSXin Li as large as 255.
710*a58d3d2aSXin LiSee <xref target="channel_mapping"/> for details.
711*a58d3d2aSXin Li<vspace blankLines="1"/>
712*a58d3d2aSXin Li</t>
713*a58d3d2aSXin Li<t>Pre-skip (16 bits, unsigned, little
714*a58d3d2aSXin Li endian):
715*a58d3d2aSXin Li<vspace blankLines="1"/>
716*a58d3d2aSXin LiThis is the number of samples (at 48&nbsp;kHz) to discard from the decoder
717*a58d3d2aSXin Li output when starting playback, and also the number to subtract from a page's
718*a58d3d2aSXin Li granule position to calculate its PCM sample position.
719*a58d3d2aSXin LiWhen cropping the beginning of existing Ogg Opus streams, a pre-skip of at
720*a58d3d2aSXin Li least 3,840&nbsp;samples (80&nbsp;ms) is RECOMMENDED to ensure complete
721*a58d3d2aSXin Li convergence in the decoder.
722*a58d3d2aSXin Li<vspace blankLines="1"/>
723*a58d3d2aSXin Li</t>
724*a58d3d2aSXin Li<t>Input Sample Rate (32 bits, unsigned, little
725*a58d3d2aSXin Li endian):
726*a58d3d2aSXin Li<vspace blankLines="1"/>
727*a58d3d2aSXin LiThis is the sample rate of the original input (before encoding), in Hz.
728*a58d3d2aSXin LiThis field is <spanx style="emph">not</spanx> the sample rate to use for
729*a58d3d2aSXin Li playback of the encoded data.
730*a58d3d2aSXin Li<vspace blankLines="1"/>
731*a58d3d2aSXin LiOpus can switch between internal audio bandwidths of 4, 6, 8, 12, and
732*a58d3d2aSXin Li 20&nbsp;kHz.
733*a58d3d2aSXin LiEach packet in the stream can have a different audio bandwidth.
734*a58d3d2aSXin LiRegardless of the audio bandwidth, the reference decoder supports decoding any
735*a58d3d2aSXin Li stream at a sample rate of 8, 12, 16, 24, or 48&nbsp;kHz.
736*a58d3d2aSXin LiThe original sample rate of the audio passed to the encoder is not preserved
737*a58d3d2aSXin Li by the lossy compression.
738*a58d3d2aSXin Li<vspace blankLines="1"/>
739*a58d3d2aSXin LiAn Ogg Opus player SHOULD select the playback sample rate according to the
740*a58d3d2aSXin Li following procedure:
741*a58d3d2aSXin Li<list style="numbers">
742*a58d3d2aSXin Li<t>If the hardware supports 48&nbsp;kHz playback, decode at 48&nbsp;kHz.</t>
743*a58d3d2aSXin Li<t>Otherwise, if the hardware's highest available sample rate is a supported
744*a58d3d2aSXin Li rate, decode at this sample rate.</t>
745*a58d3d2aSXin Li<t>Otherwise, if the hardware's highest available sample rate is less than
746*a58d3d2aSXin Li 48&nbsp;kHz, decode at the next higher Opus supported rate above the highest
747*a58d3d2aSXin Li available hardware rate and resample.</t>
748*a58d3d2aSXin Li<t>Otherwise, decode at 48&nbsp;kHz and resample.</t>
749*a58d3d2aSXin Li</list>
750*a58d3d2aSXin LiHowever, the 'Input Sample Rate' field allows the muxer to pass the sample
751*a58d3d2aSXin Li rate of the original input stream as metadata.
752*a58d3d2aSXin LiThis is useful when the user requires the output sample rate to match the
753*a58d3d2aSXin Li input sample rate.
754*a58d3d2aSXin LiFor example, when not playing the output, an implementation writing PCM format
755*a58d3d2aSXin Li samples to disk might choose to resample the audio back to the original input
756*a58d3d2aSXin Li sample rate to reduce surprise to the user, who might reasonably expect to get
757*a58d3d2aSXin Li back a file with the same sample rate.
758*a58d3d2aSXin Li<vspace blankLines="1"/>
759*a58d3d2aSXin LiA value of zero indicates 'unspecified'.
760*a58d3d2aSXin LiMuxers SHOULD write the actual input sample rate or zero, but implementations
761*a58d3d2aSXin Li which do something with this field SHOULD take care to behave sanely if given
762*a58d3d2aSXin Li crazy values (e.g., do not actually upsample the output to 10 MHz if
763*a58d3d2aSXin Li requested).
764*a58d3d2aSXin LiImplementations SHOULD support input sample rates between 8&nbsp;kHz and
765*a58d3d2aSXin Li 192&nbsp;kHz (inclusive).
766*a58d3d2aSXin LiRates outside this range MAY be ignored by falling back to the default rate of
767*a58d3d2aSXin Li 48&nbsp;kHz instead.
768*a58d3d2aSXin Li<vspace blankLines="1"/>
769*a58d3d2aSXin Li</t>
770*a58d3d2aSXin Li<t>Output Gain (16 bits, signed, little endian):
771*a58d3d2aSXin Li<vspace blankLines="1"/>
772*a58d3d2aSXin LiThis is a gain to be applied when decoding.
773*a58d3d2aSXin LiIt is 20*log10 of the factor by which to scale the decoder output to achieve
774*a58d3d2aSXin Li the desired playback volume, stored in a 16-bit, signed, two's complement
775*a58d3d2aSXin Li fixed-point value with 8 fractional bits (i.e.,
776*a58d3d2aSXin Li Q7.8&nbsp;<xref target="q-notation"/>).
777*a58d3d2aSXin Li<vspace blankLines="1"/>
778*a58d3d2aSXin LiTo apply the gain, an implementation could use
779*a58d3d2aSXin Li<figure align="center">
780*a58d3d2aSXin Li<artwork align="center"><![CDATA[
781*a58d3d2aSXin Lisample *= pow(10, output_gain/(20.0*256)) ,
782*a58d3d2aSXin Li]]></artwork>
783*a58d3d2aSXin Li</figure>
784*a58d3d2aSXin Li where output_gain is the raw 16-bit value from the header.
785*a58d3d2aSXin Li<vspace blankLines="1"/>
786*a58d3d2aSXin LiPlayers and media frameworks SHOULD apply it by default.
787*a58d3d2aSXin LiIf a player chooses to apply any volume adjustment or gain modification, such
788*a58d3d2aSXin Li as the R128_TRACK_GAIN (see <xref target="comment_header"/>), the adjustment
789*a58d3d2aSXin Li MUST be applied in addition to this output gain in order to achieve playback
790*a58d3d2aSXin Li at the normalized volume.
791*a58d3d2aSXin Li<vspace blankLines="1"/>
792*a58d3d2aSXin LiA muxer SHOULD set this field to zero, and instead apply any gain prior to
793*a58d3d2aSXin Li encoding, when this is possible and does not conflict with the user's wishes.
794*a58d3d2aSXin LiA nonzero output gain indicates the gain was adjusted after encoding, or that
795*a58d3d2aSXin Li a user wished to adjust the gain for playback while preserving the ability
796*a58d3d2aSXin Li to recover the original signal amplitude.
797*a58d3d2aSXin Li<vspace blankLines="1"/>
798*a58d3d2aSXin LiAlthough the output gain has enormous range (+/- 128 dB, enough to amplify
799*a58d3d2aSXin Li inaudible sounds to the threshold of physical pain), most applications can
800*a58d3d2aSXin Li only reasonably use a small portion of this range around zero.
801*a58d3d2aSXin LiThe large range serves in part to ensure that gain can always be losslessly
802*a58d3d2aSXin Li transferred between OpusHead and R128 gain tags (see below) without
803*a58d3d2aSXin Li saturating.
804*a58d3d2aSXin Li<vspace blankLines="1"/>
805*a58d3d2aSXin Li</t>
806*a58d3d2aSXin Li<t>Channel Mapping Family (8 bits, unsigned):
807*a58d3d2aSXin Li<vspace blankLines="1"/>
808*a58d3d2aSXin LiThis octet indicates the order and semantic meaning of the output channels.
809*a58d3d2aSXin Li<vspace blankLines="1"/>
810*a58d3d2aSXin LiEach currently specified value of this octet indicates a mapping family, which
811*a58d3d2aSXin Li defines a set of allowed channel counts, and the ordered set of channel names
812*a58d3d2aSXin Li for each allowed channel count.
813*a58d3d2aSXin LiThe details are described in <xref target="channel_mapping"/>.
814*a58d3d2aSXin Li</t>
815*a58d3d2aSXin Li<t>Channel Mapping Table:
816*a58d3d2aSXin LiThis table defines the mapping from encoded streams to output channels.
817*a58d3d2aSXin LiIts contents are specified in <xref target="channel_mapping"/>.
818*a58d3d2aSXin Li</t>
819*a58d3d2aSXin Li</list>
820*a58d3d2aSXin Li</t>
821*a58d3d2aSXin Li
822*a58d3d2aSXin Li<t>
823*a58d3d2aSXin LiAll fields in the ID headers are REQUIRED, except for the channel mapping
824*a58d3d2aSXin Li table, which MUST be omitted when the channel mapping family is 0, but
825*a58d3d2aSXin Li is REQUIRED otherwise.
826*a58d3d2aSXin LiImplementations SHOULD treat a stream as invalid if it contains an ID header
827*a58d3d2aSXin Li that does not have enough data for these fields, even if it contains a valid
828*a58d3d2aSXin Li Magic Signature.
829*a58d3d2aSXin LiFuture versions of this specification, even backwards-compatible versions,
830*a58d3d2aSXin Li might include additional fields in the ID header.
831*a58d3d2aSXin LiIf an ID header has a compatible major version, but a larger minor version,
832*a58d3d2aSXin Li an implementation MUST NOT treat it as invalid for containing additional data
833*a58d3d2aSXin Li not specified here, provided it still completes on the first page.
834*a58d3d2aSXin Li</t>
835*a58d3d2aSXin Li
836*a58d3d2aSXin Li<section anchor="channel_mapping" title="Channel Mapping">
837*a58d3d2aSXin Li<t>
838*a58d3d2aSXin LiAn Ogg Opus stream allows mapping one number of Opus streams (N) to a possibly
839*a58d3d2aSXin Li larger number of decoded channels (M&nbsp;+&nbsp;N) to yet another number of
840*a58d3d2aSXin Li output channels (C), which might be larger or smaller than the number of
841*a58d3d2aSXin Li decoded channels.
842*a58d3d2aSXin LiThe order and meaning of these channels are defined by a channel mapping,
843*a58d3d2aSXin Li which consists of the 'channel mapping family' octet and, for channel mapping
844*a58d3d2aSXin Li families other than family&nbsp;0, a channel mapping table, as illustrated in
845*a58d3d2aSXin Li <xref target="channel_mapping_table"/>.
846*a58d3d2aSXin Li</t>
847*a58d3d2aSXin Li
848*a58d3d2aSXin Li<figure anchor="channel_mapping_table" title="Channel Mapping Table"
849*a58d3d2aSXin Li align="center">
850*a58d3d2aSXin Li<artwork align="center"><![CDATA[
851*a58d3d2aSXin Li 0                   1                   2                   3
852*a58d3d2aSXin Li 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
853*a58d3d2aSXin Li                                                +-+-+-+-+-+-+-+-+
854*a58d3d2aSXin Li                                                | Stream Count  |
855*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
856*a58d3d2aSXin Li| Coupled Count |              Channel Mapping...               :
857*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
858*a58d3d2aSXin Li]]></artwork>
859*a58d3d2aSXin Li</figure>
860*a58d3d2aSXin Li
861*a58d3d2aSXin Li<t>
862*a58d3d2aSXin LiThe fields in the channel mapping table have the following meaning:
863*a58d3d2aSXin Li<list style="numbers" counter="8">
864*a58d3d2aSXin Li<t>Stream Count 'N' (8 bits, unsigned):
865*a58d3d2aSXin Li<vspace blankLines="1"/>
866*a58d3d2aSXin LiThis is the total number of streams encoded in each Ogg packet.
867*a58d3d2aSXin LiThis value is necessary to correctly parse the packed Opus packets inside an
868*a58d3d2aSXin Li Ogg packet, as described in <xref target="packet_organization"/>.
869*a58d3d2aSXin LiThis value MUST NOT be zero, as without at least one Opus packet with a valid
870*a58d3d2aSXin Li TOC sequence, a demuxer cannot recover the duration of an Ogg packet.
871*a58d3d2aSXin Li<vspace blankLines="1"/>
872*a58d3d2aSXin LiFor channel mapping family&nbsp;0, this value defaults to 1, and is not coded.
873*a58d3d2aSXin Li<vspace blankLines="1"/>
874*a58d3d2aSXin Li</t>
875*a58d3d2aSXin Li<t>Coupled Stream Count 'M' (8 bits, unsigned):
876*a58d3d2aSXin LiThis is the number of streams whose decoders are to be configured to produce
877*a58d3d2aSXin Li two channels (stereo).
878*a58d3d2aSXin LiThis MUST be no larger than the total number of streams, N.
879*a58d3d2aSXin Li<vspace blankLines="1"/>
880*a58d3d2aSXin LiEach packet in an Opus stream has an internal channel count of 1 or 2, which
881*a58d3d2aSXin Li can change from packet to packet.
882*a58d3d2aSXin LiThis is selected by the encoder depending on the bitrate and the audio being
883*a58d3d2aSXin Li encoded.
884*a58d3d2aSXin LiThe original channel count of the audio passed to the encoder is not
885*a58d3d2aSXin Li necessarily preserved by the lossy compression.
886*a58d3d2aSXin Li<vspace blankLines="1"/>
887*a58d3d2aSXin LiRegardless of the internal channel count, any Opus stream can be decoded as
888*a58d3d2aSXin Li mono (a single channel) or stereo (two channels) by appropriate initialization
889*a58d3d2aSXin Li of the decoder.
890*a58d3d2aSXin LiThe 'coupled stream count' field indicates that the decoders for the first M
891*a58d3d2aSXin Li Opus streams are to be initialized for stereo (two-channel) output, and the
892*a58d3d2aSXin Li remaining (N&nbsp;-&nbsp;M) decoders are to be initialized for mono (a single
893*a58d3d2aSXin Li channel) only.
894*a58d3d2aSXin LiThe total number of decoded channels, (M&nbsp;+&nbsp;N), MUST be no larger than
895*a58d3d2aSXin Li 255, as there is no way to index more channels than that in the channel
896*a58d3d2aSXin Li mapping.
897*a58d3d2aSXin Li<vspace blankLines="1"/>
898*a58d3d2aSXin LiFor channel mapping family&nbsp;0, this value defaults to (C&nbsp;-&nbsp;1)
899*a58d3d2aSXin Li (i.e., 0 for mono and 1 for stereo), and is not coded.
900*a58d3d2aSXin Li<vspace blankLines="1"/>
901*a58d3d2aSXin Li</t>
902*a58d3d2aSXin Li<t>Channel Mapping (8*C bits):
903*a58d3d2aSXin LiThis contains one octet per output channel, indicating which decoded channel
904*a58d3d2aSXin Li is to be used for each one.
905*a58d3d2aSXin LiLet 'index' be the value of this octet for a particular output channel.
906*a58d3d2aSXin LiThis value MUST either be smaller than (M&nbsp;+&nbsp;N), or be the special
907*a58d3d2aSXin Li value 255.
908*a58d3d2aSXin LiIf 'index' is less than 2*M, the output MUST be taken from decoding stream
909*a58d3d2aSXin Li ('index'/2) as stereo and selecting the left channel if 'index' is even, and
910*a58d3d2aSXin Li the right channel if 'index' is odd.
911*a58d3d2aSXin LiIf 'index' is 2*M or larger, but less than 255, the output MUST be taken from
912*a58d3d2aSXin Li decoding stream ('index'&nbsp;-&nbsp;M) as mono.
913*a58d3d2aSXin LiIf 'index' is 255, the corresponding output channel MUST contain pure silence.
914*a58d3d2aSXin Li<vspace blankLines="1"/>
915*a58d3d2aSXin LiThe number of output channels, C, is not constrained to match the number of
916*a58d3d2aSXin Li decoded channels (M&nbsp;+&nbsp;N).
917*a58d3d2aSXin LiA single index value MAY appear multiple times, i.e., the same decoded channel
918*a58d3d2aSXin Li might be mapped to multiple output channels.
919*a58d3d2aSXin LiSome decoded channels might not be assigned to any output channel, as well.
920*a58d3d2aSXin Li<vspace blankLines="1"/>
921*a58d3d2aSXin LiFor channel mapping family&nbsp;0, the first index defaults to 0, and if
922*a58d3d2aSXin Li C&nbsp;==&nbsp;2, the second index defaults to 1.
923*a58d3d2aSXin LiNeither index is coded.
924*a58d3d2aSXin Li</t>
925*a58d3d2aSXin Li</list>
926*a58d3d2aSXin Li</t>
927*a58d3d2aSXin Li
928*a58d3d2aSXin Li<t>
929*a58d3d2aSXin LiAfter producing the output channels, the channel mapping family determines the
930*a58d3d2aSXin Li semantic meaning of each one.
931*a58d3d2aSXin LiThere are three defined mapping families in this specification.
932*a58d3d2aSXin Li</t>
933*a58d3d2aSXin Li
934*a58d3d2aSXin Li<section anchor="channel_mapping_0" title="Channel Mapping Family 0">
935*a58d3d2aSXin Li<t>
936*a58d3d2aSXin LiAllowed numbers of channels: 1 or 2.
937*a58d3d2aSXin LiRTP mapping.
938*a58d3d2aSXin LiThis is the same channel interpretation as <xref target="RFC7587"/>.
939*a58d3d2aSXin Li</t>
940*a58d3d2aSXin Li<t>
941*a58d3d2aSXin Li<list style="symbols">
942*a58d3d2aSXin Li<t>1 channel: monophonic (mono).</t>
943*a58d3d2aSXin Li<t>2 channels: stereo (left, right).</t>
944*a58d3d2aSXin Li</list>
945*a58d3d2aSXin LiSpecial mapping: This channel mapping value also
946*a58d3d2aSXin Li indicates that the content consists of a single Opus stream that is stereo if
947*a58d3d2aSXin Li and only if C&nbsp;==&nbsp;2, with stream index&nbsp;0 mapped to output
948*a58d3d2aSXin Li channel&nbsp;0 (mono, or left channel) and stream index&nbsp;1 mapped to
949*a58d3d2aSXin Li output channel&nbsp;1 (right channel) if stereo.
950*a58d3d2aSXin LiWhen the 'channel mapping family' octet has this value, the channel mapping
951*a58d3d2aSXin Li table MUST be omitted from the ID header packet.
952*a58d3d2aSXin Li</t>
953*a58d3d2aSXin Li</section>
954*a58d3d2aSXin Li
955*a58d3d2aSXin Li<section anchor="channel_mapping_1" title="Channel Mapping Family 1">
956*a58d3d2aSXin Li<t>
957*a58d3d2aSXin LiAllowed numbers of channels: 1...8.
958*a58d3d2aSXin LiVorbis channel order (see below).
959*a58d3d2aSXin Li</t>
960*a58d3d2aSXin Li<t>
961*a58d3d2aSXin LiEach channel is assigned to a speaker location in a conventional surround
962*a58d3d2aSXin Li arrangement.
963*a58d3d2aSXin LiSpecific locations depend on the number of channels, and are given below
964*a58d3d2aSXin Li in order of the corresponding channel indices.
965*a58d3d2aSXin Li<list style="symbols">
966*a58d3d2aSXin Li  <t>1 channel: monophonic (mono).</t>
967*a58d3d2aSXin Li  <t>2 channels: stereo (left, right).</t>
968*a58d3d2aSXin Li  <t>3 channels: linear surround (left, center, right).</t>
969*a58d3d2aSXin Li  <t>4 channels: quadraphonic (front&nbsp;left, front&nbsp;right, rear&nbsp;left, rear&nbsp;right).</t>
970*a58d3d2aSXin Li  <t>5 channels: 5.0 surround (front&nbsp;left, front&nbsp;center, front&nbsp;right, rear&nbsp;left, rear&nbsp;right).</t>
971*a58d3d2aSXin Li  <t>6 channels: 5.1 surround (front&nbsp;left, front&nbsp;center, front&nbsp;right, rear&nbsp;left, rear&nbsp;right, LFE).</t>
972*a58d3d2aSXin Li  <t>7 channels: 6.1 surround (front&nbsp;left, front&nbsp;center, front&nbsp;right, side&nbsp;left, side&nbsp;right, rear&nbsp;center, LFE).</t>
973*a58d3d2aSXin Li  <t>8 channels: 7.1 surround (front&nbsp;left, front&nbsp;center, front&nbsp;right, side&nbsp;left, side&nbsp;right, rear&nbsp;left, rear&nbsp;right, LFE).</t>
974*a58d3d2aSXin Li</list>
975*a58d3d2aSXin Li</t>
976*a58d3d2aSXin Li<t>
977*a58d3d2aSXin LiThis set of surround options and speaker location orderings is the same
978*a58d3d2aSXin Li as those used by the Vorbis codec <xref target="vorbis-mapping"/>.
979*a58d3d2aSXin LiThe ordering is different from the one used by the
980*a58d3d2aSXin Li WAVE <xref target="wave-multichannel"/> and
981*a58d3d2aSXin Li Free Lossless Audio Codec (FLAC) <xref target="flac"/> formats,
982*a58d3d2aSXin Li so correct ordering requires permutation of the output channels when decoding
983*a58d3d2aSXin Li to or encoding from those formats.
984*a58d3d2aSXin Li'LFE' here refers to a Low Frequency Effects channel, often mapped to a
985*a58d3d2aSXin Li  subwoofer with no particular spatial position.
986*a58d3d2aSXin LiImplementations SHOULD identify 'side' or 'rear' speaker locations with
987*a58d3d2aSXin Li 'surround' and 'back' as appropriate when interfacing with audio formats
988*a58d3d2aSXin Li or systems which prefer that terminology.
989*a58d3d2aSXin Li</t>
990*a58d3d2aSXin Li</section>
991*a58d3d2aSXin Li
992*a58d3d2aSXin Li<section anchor="channel_mapping_255"
993*a58d3d2aSXin Li title="Channel Mapping Family 255">
994*a58d3d2aSXin Li<t>
995*a58d3d2aSXin LiAllowed numbers of channels: 1...255.
996*a58d3d2aSXin LiNo defined channel meaning.
997*a58d3d2aSXin Li</t>
998*a58d3d2aSXin Li<t>
999*a58d3d2aSXin LiChannels are unidentified.
1000*a58d3d2aSXin LiGeneral-purpose players SHOULD NOT attempt to play these streams.
1001*a58d3d2aSXin LiOffline implementations MAY deinterleave the output into separate PCM files,
1002*a58d3d2aSXin Li one per channel.
1003*a58d3d2aSXin LiImplementations SHOULD NOT produce output for channels mapped to stream index
1004*a58d3d2aSXin Li 255 (pure silence) unless they have no other way to indicate the index of
1005*a58d3d2aSXin Li non-silent channels.
1006*a58d3d2aSXin Li</t>
1007*a58d3d2aSXin Li</section>
1008*a58d3d2aSXin Li
1009*a58d3d2aSXin Li<section anchor="channel_mapping_undefined"
1010*a58d3d2aSXin Li title="Undefined Channel Mappings">
1011*a58d3d2aSXin Li<t>
1012*a58d3d2aSXin LiThe remaining channel mapping families (2...254) are reserved.
1013*a58d3d2aSXin LiA demuxer implementation encountering a reserved channel mapping family value
1014*a58d3d2aSXin Li SHOULD act as though the value is 255.
1015*a58d3d2aSXin Li</t>
1016*a58d3d2aSXin Li</section>
1017*a58d3d2aSXin Li
1018*a58d3d2aSXin Li<section anchor="downmix" title="Downmixing">
1019*a58d3d2aSXin Li<t>
1020*a58d3d2aSXin LiAn Ogg Opus player MUST support any valid channel mapping with a channel
1021*a58d3d2aSXin Li mapping family of 0 or 1, even if the number of channels does not match the
1022*a58d3d2aSXin Li physically connected audio hardware.
1023*a58d3d2aSXin LiPlayers SHOULD perform channel mixing to increase or reduce the number of
1024*a58d3d2aSXin Li channels as needed.
1025*a58d3d2aSXin Li</t>
1026*a58d3d2aSXin Li
1027*a58d3d2aSXin Li<t>
1028*a58d3d2aSXin LiImplementations MAY use the matrices in
1029*a58d3d2aSXin Li Figures&nbsp;<xref target="downmix-matrix-3" format="counter"/>
1030*a58d3d2aSXin Li through&nbsp;<xref target="downmix-matrix-8" format="counter"/> to implement
1031*a58d3d2aSXin Li downmixing from multichannel files using
1032*a58d3d2aSXin Li <xref target="channel_mapping_1">Channel Mapping Family 1</xref>, which are
1033*a58d3d2aSXin Li known to give acceptable results for stereo.
1034*a58d3d2aSXin LiMatrices for 3 and 4 channels are normalized so each coefficient row sums
1035*a58d3d2aSXin Li to 1 to avoid clipping.
1036*a58d3d2aSXin LiFor 5 or more channels they are normalized to 2 as a compromise between
1037*a58d3d2aSXin Li clipping and dynamic range reduction.
1038*a58d3d2aSXin Li</t>
1039*a58d3d2aSXin Li<t>
1040*a58d3d2aSXin LiIn these matrices the front left and front right channels are generally
1041*a58d3d2aSXin Lipassed through directly.
1042*a58d3d2aSXin LiWhen a surround channel is split between both the left and right stereo
1043*a58d3d2aSXin Li channels, coefficients are chosen so their squares sum to 1, which
1044*a58d3d2aSXin Li helps preserve the perceived intensity.
1045*a58d3d2aSXin LiRear channels are mixed more diffusely or attenuated to maintain focus
1046*a58d3d2aSXin Li on the front channels.
1047*a58d3d2aSXin Li</t>
1048*a58d3d2aSXin Li
1049*a58d3d2aSXin Li<figure anchor="downmix-matrix-3"
1050*a58d3d2aSXin Li title="Stereo downmix matrix for the linear surround channel mapping"
1051*a58d3d2aSXin Li align="center">
1052*a58d3d2aSXin Li<artwork align="center"><![CDATA[
1053*a58d3d2aSXin LiL output = ( 0.585786 * left + 0.414214 * center                    )
1054*a58d3d2aSXin LiR output = (                   0.414214 * center + 0.585786 * right )
1055*a58d3d2aSXin Li]]></artwork>
1056*a58d3d2aSXin Li<postamble>
1057*a58d3d2aSXin LiExact coefficient values are 1 and 1/sqrt(2), multiplied by
1058*a58d3d2aSXin Li 1/(1&nbsp;+&nbsp;1/sqrt(2)) for normalization.
1059*a58d3d2aSXin Li</postamble>
1060*a58d3d2aSXin Li</figure>
1061*a58d3d2aSXin Li
1062*a58d3d2aSXin Li<figure anchor="downmix-matrix-4"
1063*a58d3d2aSXin Li title="Stereo downmix matrix for the quadraphonic channel mapping"
1064*a58d3d2aSXin Li align="center">
1065*a58d3d2aSXin Li<artwork align="center"><![CDATA[
1066*a58d3d2aSXin Li/          \   /                                     \ / FL \
1067*a58d3d2aSXin Li| L output |   | 0.422650 0.000000 0.366025 0.211325 | | FR |
1068*a58d3d2aSXin Li| R output | = | 0.000000 0.422650 0.211325 0.366025 | | RL |
1069*a58d3d2aSXin Li\          /   \                                     / \ RR /
1070*a58d3d2aSXin Li]]></artwork>
1071*a58d3d2aSXin Li<postamble>
1072*a58d3d2aSXin LiExact coefficient values are 1, sqrt(3)/2 and 1/2, multiplied by
1073*a58d3d2aSXin Li 1/(1&nbsp;+&nbsp;sqrt(3)/2&nbsp;+&nbsp;1/2) for normalization.
1074*a58d3d2aSXin Li</postamble>
1075*a58d3d2aSXin Li</figure>
1076*a58d3d2aSXin Li
1077*a58d3d2aSXin Li<figure anchor="downmix-matrix-5"
1078*a58d3d2aSXin Li title="Stereo downmix matrix for the 5.0 surround mapping"
1079*a58d3d2aSXin Li align="center">
1080*a58d3d2aSXin Li<artwork align="center"><![CDATA[
1081*a58d3d2aSXin Li                                                         / FL \
1082*a58d3d2aSXin Li/   \   /                                              \ | FC |
1083*a58d3d2aSXin Li| L |   | 0.650802 0.460186 0.000000 0.563611 0.325401 | | FR |
1084*a58d3d2aSXin Li| R | = | 0.000000 0.460186 0.650802 0.325401 0.563611 | | RL |
1085*a58d3d2aSXin Li\   /   \                                              / | RR |
1086*a58d3d2aSXin Li                                                         \    /
1087*a58d3d2aSXin Li]]></artwork>
1088*a58d3d2aSXin Li<postamble>
1089*a58d3d2aSXin LiExact coefficient values are 1, 1/sqrt(2), sqrt(3)/2 and 1/2, multiplied by
1090*a58d3d2aSXin Li 2/(1&nbsp;+&nbsp;1/sqrt(2)&nbsp;+&nbsp;sqrt(3)/2&nbsp;+&nbsp;1/2)
1091*a58d3d2aSXin Li for normalization.
1092*a58d3d2aSXin Li</postamble>
1093*a58d3d2aSXin Li</figure>
1094*a58d3d2aSXin Li
1095*a58d3d2aSXin Li<figure anchor="downmix-matrix-6"
1096*a58d3d2aSXin Li title="Stereo downmix matrix for the 5.1 surround mapping"
1097*a58d3d2aSXin Li align="center">
1098*a58d3d2aSXin Li<artwork align="center"><![CDATA[
1099*a58d3d2aSXin Li                                                                /FL \
1100*a58d3d2aSXin Li/ \   /                                                       \ |FC |
1101*a58d3d2aSXin Li|L|   | 0.529067 0.374107 0.000000 0.458186 0.264534 0.374107 | |FR |
1102*a58d3d2aSXin Li|R| = | 0.000000 0.374107 0.529067 0.264534 0.458186 0.374107 | |RL |
1103*a58d3d2aSXin Li\ /   \                                                       / |RR |
1104*a58d3d2aSXin Li                                                                \LFE/
1105*a58d3d2aSXin Li]]></artwork>
1106*a58d3d2aSXin Li<postamble>
1107*a58d3d2aSXin LiExact coefficient values are 1, 1/sqrt(2), sqrt(3)/2 and 1/2, multiplied by
1108*a58d3d2aSXin Li2/(1&nbsp;+&nbsp;1/sqrt(2)&nbsp;+&nbsp;sqrt(3)/2&nbsp;+&nbsp;1/2&nbsp;+&nbsp;1/sqrt(2))
1109*a58d3d2aSXin Li for normalization.
1110*a58d3d2aSXin Li</postamble>
1111*a58d3d2aSXin Li</figure>
1112*a58d3d2aSXin Li
1113*a58d3d2aSXin Li<figure anchor="downmix-matrix-7"
1114*a58d3d2aSXin Li title="Stereo downmix matrix for the 6.1 surround mapping"
1115*a58d3d2aSXin Li align="center">
1116*a58d3d2aSXin Li<artwork align="center"><![CDATA[
1117*a58d3d2aSXin Li /                                                                \
1118*a58d3d2aSXin Li | 0.455310 0.321953 0.000000 0.394310 0.227655 0.278819 0.321953 |
1119*a58d3d2aSXin Li | 0.000000 0.321953 0.455310 0.227655 0.394310 0.278819 0.321953 |
1120*a58d3d2aSXin Li \                                                                /
1121*a58d3d2aSXin Li]]></artwork>
1122*a58d3d2aSXin Li<postamble>
1123*a58d3d2aSXin LiExact coefficient values are 1, 1/sqrt(2), sqrt(3)/2, 1/2 and
1124*a58d3d2aSXin Li sqrt(3)/2/sqrt(2), multiplied by
1125*a58d3d2aSXin Li 2/(1&nbsp;+&nbsp;1/sqrt(2)&nbsp;+&nbsp;sqrt(3)/2&nbsp;+&nbsp;1/2&nbsp;+
1126*a58d3d2aSXin Li sqrt(3)/2/sqrt(2)&nbsp;+&nbsp;1/sqrt(2)) for normalization.
1127*a58d3d2aSXin LiThe coefficients are in the same order as in <xref target="channel_mapping_1" />,
1128*a58d3d2aSXin Li and the matrices above.
1129*a58d3d2aSXin Li</postamble>
1130*a58d3d2aSXin Li</figure>
1131*a58d3d2aSXin Li
1132*a58d3d2aSXin Li<figure anchor="downmix-matrix-8"
1133*a58d3d2aSXin Li title="Stereo downmix matrix for the 7.1 surround mapping"
1134*a58d3d2aSXin Li align="center">
1135*a58d3d2aSXin Li<artwork align="center"><![CDATA[
1136*a58d3d2aSXin Li/                                                                 \
1137*a58d3d2aSXin Li| .388631 .274804 .000000 .336565 .194316 .336565 .194316 .274804 |
1138*a58d3d2aSXin Li| .000000 .274804 .388631 .194316 .336565 .194316 .336565 .274804 |
1139*a58d3d2aSXin Li\                                                                 /
1140*a58d3d2aSXin Li]]></artwork>
1141*a58d3d2aSXin Li<postamble>
1142*a58d3d2aSXin LiExact coefficient values are 1, 1/sqrt(2), sqrt(3)/2 and 1/2, multiplied by
1143*a58d3d2aSXin Li 2/(2&nbsp;+&nbsp;2/sqrt(2)&nbsp;+&nbsp;sqrt(3)) for normalization.
1144*a58d3d2aSXin LiThe coefficients are in the same order as in <xref target="channel_mapping_1" />,
1145*a58d3d2aSXin Li and the matrices above.
1146*a58d3d2aSXin Li</postamble>
1147*a58d3d2aSXin Li</figure>
1148*a58d3d2aSXin Li
1149*a58d3d2aSXin Li</section>
1150*a58d3d2aSXin Li
1151*a58d3d2aSXin Li</section> <!-- end channel_mapping_table -->
1152*a58d3d2aSXin Li
1153*a58d3d2aSXin Li</section> <!-- end id_header -->
1154*a58d3d2aSXin Li
1155*a58d3d2aSXin Li<section anchor="comment_header" title="Comment Header">
1156*a58d3d2aSXin Li
1157*a58d3d2aSXin Li<figure anchor="comment_header_packet" title="Comment Header Packet"
1158*a58d3d2aSXin Li align="center">
1159*a58d3d2aSXin Li<artwork align="center"><![CDATA[
1160*a58d3d2aSXin Li 0                   1                   2                   3
1161*a58d3d2aSXin Li 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1 2 3 4 5 6 7 8 9 0 1
1162*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1163*a58d3d2aSXin Li|      'O'      |      'p'      |      'u'      |      's'      |
1164*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1165*a58d3d2aSXin Li|      'T'      |      'a'      |      'g'      |      's'      |
1166*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1167*a58d3d2aSXin Li|                     Vendor String Length                      |
1168*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1169*a58d3d2aSXin Li|                                                               |
1170*a58d3d2aSXin Li:                        Vendor String...                       :
1171*a58d3d2aSXin Li|                                                               |
1172*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1173*a58d3d2aSXin Li|                   User Comment List Length                    |
1174*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1175*a58d3d2aSXin Li|                 User Comment #0 String Length                 |
1176*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1177*a58d3d2aSXin Li|                                                               |
1178*a58d3d2aSXin Li:                   User Comment #0 String...                   :
1179*a58d3d2aSXin Li|                                                               |
1180*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1181*a58d3d2aSXin Li|                 User Comment #1 String Length                 |
1182*a58d3d2aSXin Li+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+-+
1183*a58d3d2aSXin Li:                                                               :
1184*a58d3d2aSXin Li]]></artwork>
1185*a58d3d2aSXin Li</figure>
1186*a58d3d2aSXin Li
1187*a58d3d2aSXin Li<t>
1188*a58d3d2aSXin LiThe comment header consists of a 64-bit magic signature, followed by data in
1189*a58d3d2aSXin Li the same format as the <xref target="vorbis-comment"/> header used in Ogg
1190*a58d3d2aSXin Li Vorbis, except (like Ogg Theora and Speex) the final "framing bit" specified
1191*a58d3d2aSXin Li in the Vorbis spec is not present.
1192*a58d3d2aSXin Li<list style="numbers">
1193*a58d3d2aSXin Li<t>Magic Signature:
1194*a58d3d2aSXin Li<vspace blankLines="1"/>
1195*a58d3d2aSXin LiThis is an 8-octet (64-bit) field that allows codec identification and is
1196*a58d3d2aSXin Li human-readable.
1197*a58d3d2aSXin LiIt contains, in order, the magic numbers:
1198*a58d3d2aSXin Li<list style="empty">
1199*a58d3d2aSXin Li<t>0x4F 'O'</t>
1200*a58d3d2aSXin Li<t>0x70 'p'</t>
1201*a58d3d2aSXin Li<t>0x75 'u'</t>
1202*a58d3d2aSXin Li<t>0x73 's'</t>
1203*a58d3d2aSXin Li<t>0x54 'T'</t>
1204*a58d3d2aSXin Li<t>0x61 'a'</t>
1205*a58d3d2aSXin Li<t>0x67 'g'</t>
1206*a58d3d2aSXin Li<t>0x73 's'</t>
1207*a58d3d2aSXin Li</list>
1208*a58d3d2aSXin LiStarting with "Op" helps distinguish it from audio data packets, as this is an
1209*a58d3d2aSXin Li invalid TOC sequence.
1210*a58d3d2aSXin Li<vspace blankLines="1"/>
1211*a58d3d2aSXin Li</t>
1212*a58d3d2aSXin Li<t>Vendor String Length (32 bits, unsigned, little endian):
1213*a58d3d2aSXin Li<vspace blankLines="1"/>
1214*a58d3d2aSXin LiThis field gives the length of the following vendor string, in octets.
1215*a58d3d2aSXin LiIt MUST NOT indicate that the vendor string is longer than the rest of the
1216*a58d3d2aSXin Li packet.
1217*a58d3d2aSXin Li<vspace blankLines="1"/>
1218*a58d3d2aSXin Li</t>
1219*a58d3d2aSXin Li<t>Vendor String (variable length, UTF-8 vector):
1220*a58d3d2aSXin Li<vspace blankLines="1"/>
1221*a58d3d2aSXin LiThis is a simple human-readable tag for vendor information, encoded as a UTF-8
1222*a58d3d2aSXin Li string&nbsp;<xref target="RFC3629"/>.
1223*a58d3d2aSXin LiNo terminating null octet is necessary.
1224*a58d3d2aSXin Li<vspace blankLines="1"/>
1225*a58d3d2aSXin LiThis tag is intended to identify the codec encoder and encapsulation
1226*a58d3d2aSXin Li implementations, for tracing differences in technical behavior.
1227*a58d3d2aSXin LiUser-facing applications can use the 'ENCODER' user comment tag to identify
1228*a58d3d2aSXin Li themselves.
1229*a58d3d2aSXin Li<vspace blankLines="1"/>
1230*a58d3d2aSXin Li</t>
1231*a58d3d2aSXin Li<t>User Comment List Length (32 bits, unsigned, little endian):
1232*a58d3d2aSXin Li<vspace blankLines="1"/>
1233*a58d3d2aSXin LiThis field indicates the number of user-supplied comments.
1234*a58d3d2aSXin LiIt MAY indicate there are zero user-supplied comments, in which case there are
1235*a58d3d2aSXin Li no additional fields in the packet.
1236*a58d3d2aSXin LiIt MUST NOT indicate that there are so many comments that the comment string
1237*a58d3d2aSXin Li lengths would require more data than is available in the rest of the packet.
1238*a58d3d2aSXin Li<vspace blankLines="1"/>
1239*a58d3d2aSXin Li</t>
1240*a58d3d2aSXin Li<t>User Comment #i String Length (32 bits, unsigned, little endian):
1241*a58d3d2aSXin Li<vspace blankLines="1"/>
1242*a58d3d2aSXin LiThis field gives the length of the following user comment string, in octets.
1243*a58d3d2aSXin LiThere is one for each user comment indicated by the 'user comment list length'
1244*a58d3d2aSXin Li field.
1245*a58d3d2aSXin LiIt MUST NOT indicate that the string is longer than the rest of the packet.
1246*a58d3d2aSXin Li<vspace blankLines="1"/>
1247*a58d3d2aSXin Li</t>
1248*a58d3d2aSXin Li<t>User Comment #i String (variable length, UTF-8 vector):
1249*a58d3d2aSXin Li<vspace blankLines="1"/>
1250*a58d3d2aSXin LiThis field contains a single user comment encoded as a UTF-8
1251*a58d3d2aSXin Li string&nbsp;<xref target="RFC3629"/>.
1252*a58d3d2aSXin LiThere is one for each user comment indicated by the 'user comment list length'
1253*a58d3d2aSXin Li field.
1254*a58d3d2aSXin Li</t>
1255*a58d3d2aSXin Li</list>
1256*a58d3d2aSXin Li</t>
1257*a58d3d2aSXin Li
1258*a58d3d2aSXin Li<t>
1259*a58d3d2aSXin LiThe vendor string length and user comment list length are REQUIRED, and
1260*a58d3d2aSXin Li implementations SHOULD treat a stream as invalid if it contains a comment
1261*a58d3d2aSXin Li header that does not have enough data for these fields, or that does not
1262*a58d3d2aSXin Li contain enough data for the corresponding vendor string or user comments they
1263*a58d3d2aSXin Li describe.
1264*a58d3d2aSXin LiMaking this check before allocating the associated memory to contain the data
1265*a58d3d2aSXin Li helps prevent a possible Denial-of-Service (DoS) attack from small comment
1266*a58d3d2aSXin Li headers that claim to contain strings longer than the entire packet or more
1267*a58d3d2aSXin Li user comments than could possibly fit in the packet.
1268*a58d3d2aSXin Li</t>
1269*a58d3d2aSXin Li
1270*a58d3d2aSXin Li<t>
1271*a58d3d2aSXin LiImmediately following the user comment list, the comment header MAY
1272*a58d3d2aSXin Li contain zero-padding or other binary data which is not specified here.
1273*a58d3d2aSXin LiIf the least-significant bit of the first byte of this data is 1, then editors
1274*a58d3d2aSXin Li SHOULD preserve the contents of this data when updating the tags, but if this
1275*a58d3d2aSXin Li bit is 0, all such data MAY be treated as padding, and truncated or discarded
1276*a58d3d2aSXin Li as desired.
1277*a58d3d2aSXin LiThis allows informal experimentation with the format of this binary data until
1278*a58d3d2aSXin Li it can be specified later.
1279*a58d3d2aSXin Li</t>
1280*a58d3d2aSXin Li
1281*a58d3d2aSXin Li<t>
1282*a58d3d2aSXin LiThe comment header can be arbitrarily large and might be spread over a large
1283*a58d3d2aSXin Li number of Ogg pages.
1284*a58d3d2aSXin LiImplementations MUST avoid attempting to allocate excessive amounts of memory
1285*a58d3d2aSXin Li when presented with a very large comment header.
1286*a58d3d2aSXin LiTo accomplish this, implementations MAY treat a stream as invalid if it has a
1287*a58d3d2aSXin Li comment header larger than 125,829,120&nbsp;octets (120&nbsp;MB), and MAY
1288*a58d3d2aSXin Li ignore individual comments that are not fully contained within the first
1289*a58d3d2aSXin Li 61,440&nbsp;octets of the comment header.
1290*a58d3d2aSXin Li</t>
1291*a58d3d2aSXin Li
1292*a58d3d2aSXin Li<section anchor="comment_format" title="Tag Definitions">
1293*a58d3d2aSXin Li<t>
1294*a58d3d2aSXin LiThe user comment strings follow the NAME=value format described by
1295*a58d3d2aSXin Li <xref target="vorbis-comment"/> with the same recommended tag names:
1296*a58d3d2aSXin Li ARTIST, TITLE, DATE, ALBUM, and so on.
1297*a58d3d2aSXin Li</t>
1298*a58d3d2aSXin Li<t>
1299*a58d3d2aSXin LiTwo new comment tags are introduced here:
1300*a58d3d2aSXin Li</t>
1301*a58d3d2aSXin Li
1302*a58d3d2aSXin Li<t>First, an optional gain for track normalization:</t>
1303*a58d3d2aSXin Li<figure align="center">
1304*a58d3d2aSXin Li<artwork align="left"><![CDATA[
1305*a58d3d2aSXin LiR128_TRACK_GAIN=-573
1306*a58d3d2aSXin Li]]></artwork>
1307*a58d3d2aSXin Li</figure>
1308*a58d3d2aSXin Li<t>
1309*a58d3d2aSXin Li representing the volume shift needed to normalize the track's volume
1310*a58d3d2aSXin Li during isolated playback, in random shuffle, and so on.
1311*a58d3d2aSXin LiThe gain is a Q7.8 fixed point number in dB, as in the ID header's 'output
1312*a58d3d2aSXin Li gain' field.
1313*a58d3d2aSXin LiThis tag is similar to the REPLAYGAIN_TRACK_GAIN tag in
1314*a58d3d2aSXin Li Vorbis&nbsp;<xref target="replay-gain"/>, except that the normal volume
1315*a58d3d2aSXin Li reference is the <xref target="EBU-R128"/> standard.
1316*a58d3d2aSXin Li</t>
1317*a58d3d2aSXin Li<t>Second, an optional gain for album normalization:</t>
1318*a58d3d2aSXin Li<figure align="center">
1319*a58d3d2aSXin Li<artwork align="left"><![CDATA[
1320*a58d3d2aSXin LiR128_ALBUM_GAIN=111
1321*a58d3d2aSXin Li]]></artwork>
1322*a58d3d2aSXin Li</figure>
1323*a58d3d2aSXin Li<t>
1324*a58d3d2aSXin Li representing the volume shift needed to normalize the overall volume when
1325*a58d3d2aSXin Li played as part of a particular collection of tracks.
1326*a58d3d2aSXin LiThe gain is also a Q7.8 fixed point number in dB, as in the ID header's
1327*a58d3d2aSXin Li 'output gain' field.
1328*a58d3d2aSXin LiThe values '-573' and '111' given here are just examples.
1329*a58d3d2aSXin Li</t>
1330*a58d3d2aSXin Li<t>
1331*a58d3d2aSXin LiAn Ogg Opus stream MUST NOT have more than one of each of these tags, and if
1332*a58d3d2aSXin Li present their values MUST be an integer from -32768 to 32767, inclusive,
1333*a58d3d2aSXin Li represented in ASCII as a base 10 number with no whitespace.
1334*a58d3d2aSXin LiA leading '+' or '-' character is valid.
1335*a58d3d2aSXin LiLeading zeros are also permitted, but the value MUST be represented by
1336*a58d3d2aSXin Li no more than 6 characters.
1337*a58d3d2aSXin LiOther non-digit characters MUST NOT be present.
1338*a58d3d2aSXin Li</t>
1339*a58d3d2aSXin Li<t>
1340*a58d3d2aSXin LiIf present, R128_TRACK_GAIN and R128_ALBUM_GAIN MUST correctly represent
1341*a58d3d2aSXin Li the R128 normalization gain relative to the 'output gain' field specified
1342*a58d3d2aSXin Li in the ID header.
1343*a58d3d2aSXin LiIf a player chooses to make use of the R128_TRACK_GAIN tag or the
1344*a58d3d2aSXin Li R128_ALBUM_GAIN tag, it MUST apply those gains
1345*a58d3d2aSXin Li <spanx style="emph">in addition</spanx> to the 'output gain' value.
1346*a58d3d2aSXin LiIf a tool modifies the ID header's 'output gain' field, it MUST also update or
1347*a58d3d2aSXin Li remove the R128_TRACK_GAIN and R128_ALBUM_GAIN comment tags if present.
1348*a58d3d2aSXin LiA muxer SHOULD place the gain it wants other tools to use by default into the
1349*a58d3d2aSXin Li 'output gain' field, and not the comment tag.
1350*a58d3d2aSXin Li</t>
1351*a58d3d2aSXin Li<t>
1352*a58d3d2aSXin LiTo avoid confusion with multiple normalization schemes, an Opus comment header
1353*a58d3d2aSXin Li SHOULD NOT contain any of the REPLAYGAIN_TRACK_GAIN, REPLAYGAIN_TRACK_PEAK,
1354*a58d3d2aSXin Li REPLAYGAIN_ALBUM_GAIN, or REPLAYGAIN_ALBUM_PEAK tags, unless they are only
1355*a58d3d2aSXin Li to be used in some context where there is guaranteed to be no such confusion.
1356*a58d3d2aSXin Li<xref target="EBU-R128"/> normalization is preferred to the earlier
1357*a58d3d2aSXin Li REPLAYGAIN schemes because of its clear definition and adoption by industry.
1358*a58d3d2aSXin LiPeak normalizations are difficult to calculate reliably for lossy codecs
1359*a58d3d2aSXin Li because of variation in excursion heights due to decoder differences.
1360*a58d3d2aSXin LiIn the authors' investigations they were not applied consistently or broadly
1361*a58d3d2aSXin Li enough to merit inclusion here.
1362*a58d3d2aSXin Li</t>
1363*a58d3d2aSXin Li</section> <!-- end comment_format -->
1364*a58d3d2aSXin Li</section> <!-- end comment_header -->
1365*a58d3d2aSXin Li
1366*a58d3d2aSXin Li</section> <!-- end headers -->
1367*a58d3d2aSXin Li
1368*a58d3d2aSXin Li<section anchor="packet_size_limits" title="Packet Size Limits">
1369*a58d3d2aSXin Li<t>
1370*a58d3d2aSXin LiTechnically, valid Opus packets can be arbitrarily large due to the padding
1371*a58d3d2aSXin Li format, although the amount of non-padding data they can contain is bounded.
1372*a58d3d2aSXin LiThese packets might be spread over a similarly enormous number of Ogg pages.
1373*a58d3d2aSXin LiWhen encoding, implementations SHOULD limit the use of padding in audio data
1374*a58d3d2aSXin Li packets to no more than is necessary to make a variable bitrate (VBR) stream
1375*a58d3d2aSXin Li constant bitrate (CBR), unless they have no reasonable way to determine what
1376*a58d3d2aSXin Li is necessary.
1377*a58d3d2aSXin LiDemuxers SHOULD treat audio data packets as invalid (treat them as if they were
1378*a58d3d2aSXin Li malformed Opus packets with an invalid TOC sequence) if they are larger than
1379*a58d3d2aSXin Li 61,440&nbsp;octets per Opus stream, unless they have a specific reason for
1380*a58d3d2aSXin Li allowing extra padding.
1381*a58d3d2aSXin LiSuch packets necessarily contain more padding than needed to make a stream CBR.
1382*a58d3d2aSXin LiDemuxers MUST avoid attempting to allocate excessive amounts of memory when
1383*a58d3d2aSXin Li presented with a very large packet.
1384*a58d3d2aSXin LiDemuxers MAY treat audio data packets as invalid or partially process them if
1385*a58d3d2aSXin Li they are larger than 61,440&nbsp;octets in an Ogg Opus stream with channel
1386*a58d3d2aSXin Li mapping families&nbsp;0 or&nbsp;1.
1387*a58d3d2aSXin LiDemuxers MAY treat audio data packets as invalid or partially process them in
1388*a58d3d2aSXin Li any Ogg Opus stream if the packet is larger than 61,440&nbsp;octets and also
1389*a58d3d2aSXin Li larger than 7,680&nbsp;octets per Opus stream.
1390*a58d3d2aSXin LiThe presence of an extremely large packet in the stream could indicate a
1391*a58d3d2aSXin Li memory exhaustion attack or stream corruption.
1392*a58d3d2aSXin Li</t>
1393*a58d3d2aSXin Li<t>
1394*a58d3d2aSXin LiIn an Ogg Opus stream, the largest possible valid packet that does not use
1395*a58d3d2aSXin Li padding has a size of (61,298*N&nbsp;-&nbsp;2) octets, where N is the number of Opus streams.
1396*a58d3d2aSXin LiWith 255&nbsp;streams, this is 15,630,988&nbsp;octets and can
1397*a58d3d2aSXin Li span up to 61,298&nbsp;Ogg pages, all but one of which will have a granule
1398*a58d3d2aSXin Li position of -1.
1399*a58d3d2aSXin LiThis is of course a very extreme packet, consisting of 255&nbsp;streams, each
1400*a58d3d2aSXin Li containing 120&nbsp;ms of audio encoded as 2.5&nbsp;ms frames, each frame
1401*a58d3d2aSXin Li using the maximum possible number of octets (1275) and stored in the least
1402*a58d3d2aSXin Li efficient manner allowed (a VBR code&nbsp;3 Opus packet).
1403*a58d3d2aSXin LiEven in such a packet, most of the data will be zeros as 2.5&nbsp;ms frames
1404*a58d3d2aSXin Li cannot actually use all 1275&nbsp;octets.
1405*a58d3d2aSXin Li</t>
1406*a58d3d2aSXin Li<t>
1407*a58d3d2aSXin LiThe largest packet consisting of entirely useful data is
1408*a58d3d2aSXin Li (15,326*N&nbsp;-&nbsp;2) octets.
1409*a58d3d2aSXin LiThis corresponds to 120&nbsp;ms of audio encoded as 10&nbsp;ms frames in either
1410*a58d3d2aSXin Li SILK or Hybrid mode, but at a data rate of over 1&nbsp;Mbps, which makes little
1411*a58d3d2aSXin Li sense for the quality achieved.
1412*a58d3d2aSXin Li</t>
1413*a58d3d2aSXin Li<t>
1414*a58d3d2aSXin LiA more reasonable limit is (7,664*N&nbsp;-&nbsp;2) octets.
1415*a58d3d2aSXin LiThis corresponds to 120&nbsp;ms of audio encoded as 20&nbsp;ms stereo CELT mode
1416*a58d3d2aSXin Li frames, with a total bitrate just under 511&nbsp;kbps (not counting the Ogg
1417*a58d3d2aSXin Li encapsulation overhead).
1418*a58d3d2aSXin LiFor channel mapping family 1, N=8 provides a reasonable upper bound, as it
1419*a58d3d2aSXin Li allows for each of the 8 possible output channels to be decoded from a
1420*a58d3d2aSXin Li separate stereo Opus stream.
1421*a58d3d2aSXin LiThis gives a size of 61,310&nbsp;octets, which is rounded up to a multiple of
1422*a58d3d2aSXin Li 1,024&nbsp;octets to yield the audio data packet size of 61,440&nbsp;octets
1423*a58d3d2aSXin Li that any implementation is expected to be able to process successfully.
1424*a58d3d2aSXin Li</t>
1425*a58d3d2aSXin Li</section>
1426*a58d3d2aSXin Li
1427*a58d3d2aSXin Li<section anchor="encoder" title="Encoder Guidelines">
1428*a58d3d2aSXin Li<t>
1429*a58d3d2aSXin LiWhen encoding Opus streams, Ogg muxers SHOULD take into account the
1430*a58d3d2aSXin Li algorithmic delay of the Opus encoder.
1431*a58d3d2aSXin Li</t>
1432*a58d3d2aSXin Li<t>
1433*a58d3d2aSXin LiIn encoders derived from the reference
1434*a58d3d2aSXin Li implementation&nbsp;<xref target="RFC6716"/>, the algorithmic delay (in samples)
1435*a58d3d2aSXin Li can be queried with:
1436*a58d3d2aSXin Li</t>
1437*a58d3d2aSXin Li<figure align="center">
1438*a58d3d2aSXin Li<artwork align="center"><![CDATA[
1439*a58d3d2aSXin Li opus_encoder_ctl(encoder_state, OPUS_GET_LOOKAHEAD(&delay_samples));
1440*a58d3d2aSXin Li]]></artwork>
1441*a58d3d2aSXin Li</figure>
1442*a58d3d2aSXin Li<t>
1443*a58d3d2aSXin LiTo achieve good quality in the very first samples of a stream, implementations
1444*a58d3d2aSXin Li MAY use linear predictive coding (LPC) extrapolation to generate at least 120
1445*a58d3d2aSXin Li extra samples at the beginning to avoid the Opus encoder having to encode a
1446*a58d3d2aSXin Li discontinuous signal.
1447*a58d3d2aSXin LiFor more information on linear prediction, see
1448*a58d3d2aSXin Li <xref target="linear-prediction"/>.
1449*a58d3d2aSXin LiFor an input file containing 'length' samples, the implementation SHOULD set
1450*a58d3d2aSXin Li the pre-skip header value to (delay_samples&nbsp;+&nbsp;extra_samples), encode
1451*a58d3d2aSXin Li at least (length&nbsp;+&nbsp;delay_samples&nbsp;+&nbsp;extra_samples)
1452*a58d3d2aSXin Li samples, and set the granule position of the last page to
1453*a58d3d2aSXin Li (length&nbsp;+&nbsp;delay_samples&nbsp;+&nbsp;extra_samples).
1454*a58d3d2aSXin LiThis ensures that the encoded file has the same duration as the original, with
1455*a58d3d2aSXin Li no time offset. The best way to pad the end of the stream is to also use LPC
1456*a58d3d2aSXin Li extrapolation, but zero-padding is also acceptable.
1457*a58d3d2aSXin Li</t>
1458*a58d3d2aSXin Li
1459*a58d3d2aSXin Li<section anchor="lpc" title="LPC Extrapolation">
1460*a58d3d2aSXin Li<t>
1461*a58d3d2aSXin LiThe first step in LPC extrapolation is to compute linear prediction
1462*a58d3d2aSXin Li coefficients&nbsp;<xref target="lpc-sample"/>.
1463*a58d3d2aSXin LiWhen extending the end of the signal, order-N (typically with N ranging from 8
1464*a58d3d2aSXin Li to 40) LPC analysis is performed on a window near the end of the signal.
1465*a58d3d2aSXin LiThe last N samples are used as memory to an infinite impulse response (IIR)
1466*a58d3d2aSXin Li filter.
1467*a58d3d2aSXin Li</t>
1468*a58d3d2aSXin Li<t>
1469*a58d3d2aSXin LiThe filter is then applied on a zero input to extrapolate the end of the signal.
1470*a58d3d2aSXin LiLet a(k) be the kth LPC coefficient and x(n) be the nth sample of the signal.
1471*a58d3d2aSXin LiEach new sample past the end of the signal is computed as:
1472*a58d3d2aSXin Li</t>
1473*a58d3d2aSXin Li<figure align="center">
1474*a58d3d2aSXin Li<artwork align="center"><![CDATA[
1475*a58d3d2aSXin Li        N
1476*a58d3d2aSXin Li       ---
1477*a58d3d2aSXin Lix(n) = \   a(k)*x(n-k)
1478*a58d3d2aSXin Li       /
1479*a58d3d2aSXin Li       ---
1480*a58d3d2aSXin Li       k=1
1481*a58d3d2aSXin Li]]></artwork>
1482*a58d3d2aSXin Li</figure>
1483*a58d3d2aSXin Li<t>
1484*a58d3d2aSXin LiThe process is repeated independently for each channel.
1485*a58d3d2aSXin LiIt is possible to extend the beginning of the signal by applying the same
1486*a58d3d2aSXin Li process backward in time.
1487*a58d3d2aSXin LiWhen extending the beginning of the signal, it is best to apply a "fade in" to
1488*a58d3d2aSXin Li the extrapolated signal, e.g. by multiplying it by a half-Hanning window
1489*a58d3d2aSXin Li <xref target="hanning"/>.
1490*a58d3d2aSXin Li</t>
1491*a58d3d2aSXin Li
1492*a58d3d2aSXin Li</section>
1493*a58d3d2aSXin Li
1494*a58d3d2aSXin Li<section anchor="continuous_chaining" title="Continuous Chaining">
1495*a58d3d2aSXin Li<t>
1496*a58d3d2aSXin LiIn some applications, such as Internet radio, it is desirable to cut a long
1497*a58d3d2aSXin Li stream into smaller chains, e.g. so the comment header can be updated.
1498*a58d3d2aSXin LiThis can be done simply by separating the input streams into segments and
1499*a58d3d2aSXin Li encoding each segment independently.
1500*a58d3d2aSXin LiThe drawback of this approach is that it creates a small discontinuity
1501*a58d3d2aSXin Li at the boundary due to the lossy nature of Opus.
1502*a58d3d2aSXin LiA muxer MAY avoid this discontinuity by using the following procedure:
1503*a58d3d2aSXin Li<list style="numbers">
1504*a58d3d2aSXin Li<t>Encode the last frame of the first segment as an independent frame by
1505*a58d3d2aSXin Li turning off all forms of inter-frame prediction.
1506*a58d3d2aSXin LiDe-emphasis is allowed.</t>
1507*a58d3d2aSXin Li<t>Set the granule position of the last page to a point near the end of the
1508*a58d3d2aSXin Li last frame.</t>
1509*a58d3d2aSXin Li<t>Begin the second segment with a copy of the last frame of the first
1510*a58d3d2aSXin Li segment.</t>
1511*a58d3d2aSXin Li<t>Set the pre-skip value of the second stream in such a way as to properly
1512*a58d3d2aSXin Li join the two streams.</t>
1513*a58d3d2aSXin Li<t>Continue the encoding process normally from there, without any reset to
1514*a58d3d2aSXin Li the encoder.</t>
1515*a58d3d2aSXin Li</list>
1516*a58d3d2aSXin Li</t>
1517*a58d3d2aSXin Li<t>
1518*a58d3d2aSXin LiIn encoders derived from the reference implementation, inter-frame prediction
1519*a58d3d2aSXin Li can be turned off by calling:
1520*a58d3d2aSXin Li</t>
1521*a58d3d2aSXin Li<figure align="center">
1522*a58d3d2aSXin Li<artwork align="center"><![CDATA[
1523*a58d3d2aSXin Li opus_encoder_ctl(encoder_state, OPUS_SET_PREDICTION_DISABLED(1));
1524*a58d3d2aSXin Li]]></artwork>
1525*a58d3d2aSXin Li</figure>
1526*a58d3d2aSXin Li<t>
1527*a58d3d2aSXin LiFor best results, this implementation requires that prediction be explicitly
1528*a58d3d2aSXin Li enabled again before resuming normal encoding, even after a reset.
1529*a58d3d2aSXin Li</t>
1530*a58d3d2aSXin Li
1531*a58d3d2aSXin Li</section>
1532*a58d3d2aSXin Li
1533*a58d3d2aSXin Li</section>
1534*a58d3d2aSXin Li
1535*a58d3d2aSXin Li<section anchor="implementation" title="Implementation Status">
1536*a58d3d2aSXin Li<t>
1537*a58d3d2aSXin LiA brief summary of major implementations of this draft is available
1538*a58d3d2aSXin Li at <eref target="https://wiki.xiph.org/OggOpusImplementation"/>,
1539*a58d3d2aSXin Li along with their status.
1540*a58d3d2aSXin Li</t>
1541*a58d3d2aSXin Li<t>
1542*a58d3d2aSXin Li[Note to RFC Editor: please remove this entire section before
1543*a58d3d2aSXin Li final publication per <xref target="RFC6982"/>, along with
1544*a58d3d2aSXin Li its references.]
1545*a58d3d2aSXin Li</t>
1546*a58d3d2aSXin Li</section>
1547*a58d3d2aSXin Li
1548*a58d3d2aSXin Li<section anchor="security" title="Security Considerations">
1549*a58d3d2aSXin Li<t>
1550*a58d3d2aSXin LiImplementations of the Opus codec need to take appropriate security
1551*a58d3d2aSXin Li considerations into account, as outlined in <xref target="RFC4732"/>.
1552*a58d3d2aSXin LiThis is just as much a problem for the container as it is for the codec itself.
1553*a58d3d2aSXin LiMalicious payloads and/or input streams can be used to attack codec
1554*a58d3d2aSXin Li implementations.
1555*a58d3d2aSXin LiImplementations MUST NOT overrun their allocated memory nor consume excessive
1556*a58d3d2aSXin Li resources when decoding payloads or processing input streams.
1557*a58d3d2aSXin LiAlthough problems in encoding applications are typically rarer, this still
1558*a58d3d2aSXin Li applies to a muxer, as vulnerabilities would allow an attacker to attack
1559*a58d3d2aSXin Li transcoding gateways.
1560*a58d3d2aSXin Li</t>
1561*a58d3d2aSXin Li
1562*a58d3d2aSXin Li<t>
1563*a58d3d2aSXin LiHeader parsing code contains the most likely area for potential overruns.
1564*a58d3d2aSXin LiIt is important for implementations to ensure their buffers contain enough
1565*a58d3d2aSXin Li data for all of the required fields before attempting to read it (for example,
1566*a58d3d2aSXin Li for all of the channel map data in the ID header).
1567*a58d3d2aSXin LiImplementations would do well to validate the indices of the channel map, also,
1568*a58d3d2aSXin Li to ensure they meet all of the restrictions outlined in
1569*a58d3d2aSXin Li <xref target="channel_mapping"/>, in order to avoid attempting to read data
1570*a58d3d2aSXin Li from channels that do not exist.
1571*a58d3d2aSXin Li</t>
1572*a58d3d2aSXin Li
1573*a58d3d2aSXin Li<t>
1574*a58d3d2aSXin LiTo avoid excessive resource usage, we advise implementations to be especially
1575*a58d3d2aSXin Li wary of streams that might cause them to process far more data than was
1576*a58d3d2aSXin Li actually transmitted.
1577*a58d3d2aSXin LiFor example, a relatively small comment header may contain values for the
1578*a58d3d2aSXin Li string lengths or user comment list length that imply that it is many
1579*a58d3d2aSXin Li gigabytes in size.
1580*a58d3d2aSXin LiEven computing the size of the required buffer could overflow a 32-bit integer,
1581*a58d3d2aSXin Li and actually attempting to allocate such a buffer before verifying it would be
1582*a58d3d2aSXin Li a reasonable size is a bad idea.
1583*a58d3d2aSXin LiAfter reading the user comment list length, implementations might wish to
1584*a58d3d2aSXin Li verify that the header contains at least the minimum amount of data for that
1585*a58d3d2aSXin Li many comments (4&nbsp;additional octets per comment, to indicate each has a
1586*a58d3d2aSXin Li length of zero) before proceeding any further, again taking care to avoid
1587*a58d3d2aSXin Li overflow in these calculations.
1588*a58d3d2aSXin LiIf allocating an array of pointers to point at these strings, the size of the
1589*a58d3d2aSXin Li pointers may be larger than 4&nbsp;octets, potentially requiring a separate
1590*a58d3d2aSXin Li overflow check.
1591*a58d3d2aSXin Li</t>
1592*a58d3d2aSXin Li
1593*a58d3d2aSXin Li<t>
1594*a58d3d2aSXin LiAnother bug in this class we have observed more than once involves the handling
1595*a58d3d2aSXin Li of invalid data at the end of a stream.
1596*a58d3d2aSXin LiOften, implementations will seek to the end of a stream to locate the last
1597*a58d3d2aSXin Li timestamp in order to compute its total duration.
1598*a58d3d2aSXin LiIf they do not find a valid capture pattern and Ogg page from the desired
1599*a58d3d2aSXin Li logical stream, they will back up and try again.
1600*a58d3d2aSXin LiIf care is not taken to avoid re-scanning data that was already scanned, this
1601*a58d3d2aSXin Li search can quickly devolve into something with a complexity that is quadratic
1602*a58d3d2aSXin Li in the amount of invalid data.
1603*a58d3d2aSXin Li</t>
1604*a58d3d2aSXin Li
1605*a58d3d2aSXin Li<t>
1606*a58d3d2aSXin LiIn general, when seeking, implementations will wish to be cautious about the
1607*a58d3d2aSXin Li effects of invalid granule position values, and ensure all algorithms will
1608*a58d3d2aSXin Li continue to make progress and eventually terminate, even if these are missing
1609*a58d3d2aSXin Li or out-of-order.
1610*a58d3d2aSXin Li</t>
1611*a58d3d2aSXin Li
1612*a58d3d2aSXin Li<t>
1613*a58d3d2aSXin LiLike most other container formats, Ogg Opus streams SHOULD NOT be used with
1614*a58d3d2aSXin Li insecure ciphers or cipher modes that are vulnerable to known-plaintext
1615*a58d3d2aSXin Li attacks.
1616*a58d3d2aSXin LiElements such as the Ogg page capture pattern and the magic signatures in the
1617*a58d3d2aSXin Li ID header and the comment header all have easily predictable values, in
1618*a58d3d2aSXin Li addition to various elements of the codec data itself.
1619*a58d3d2aSXin Li</t>
1620*a58d3d2aSXin Li</section>
1621*a58d3d2aSXin Li
1622*a58d3d2aSXin Li<section anchor="content_type" title="Content Type">
1623*a58d3d2aSXin Li<t>
1624*a58d3d2aSXin LiAn "Ogg Opus file" consists of one or more sequentially multiplexed segments,
1625*a58d3d2aSXin Li each containing exactly one Ogg Opus stream.
1626*a58d3d2aSXin LiThe RECOMMENDED mime-type for Ogg Opus files is "audio/ogg".
1627*a58d3d2aSXin Li</t>
1628*a58d3d2aSXin Li
1629*a58d3d2aSXin Li<t>
1630*a58d3d2aSXin LiIf more specificity is desired, one MAY indicate the presence of Opus streams
1631*a58d3d2aSXin Li using the codecs parameter defined in <xref target="RFC6381"/> and
1632*a58d3d2aSXin Li <xref target="RFC5334"/>, e.g.,
1633*a58d3d2aSXin Li</t>
1634*a58d3d2aSXin Li<figure>
1635*a58d3d2aSXin Li<artwork align="center"><![CDATA[
1636*a58d3d2aSXin Li    audio/ogg; codecs=opus
1637*a58d3d2aSXin Li]]></artwork>
1638*a58d3d2aSXin Li</figure>
1639*a58d3d2aSXin Li<t>
1640*a58d3d2aSXin Li for an Ogg Opus file.
1641*a58d3d2aSXin Li</t>
1642*a58d3d2aSXin Li
1643*a58d3d2aSXin Li<t>
1644*a58d3d2aSXin LiThe RECOMMENDED filename extension for Ogg Opus files is '.opus'.
1645*a58d3d2aSXin Li</t>
1646*a58d3d2aSXin Li
1647*a58d3d2aSXin Li<t>
1648*a58d3d2aSXin LiWhen Opus is concurrently multiplexed with other streams in an Ogg container,
1649*a58d3d2aSXin Li one SHOULD use one of the "audio/ogg", "video/ogg", or "application/ogg"
1650*a58d3d2aSXin Li mime-types, as defined in <xref target="RFC5334"/>.
1651*a58d3d2aSXin LiSuch streams are not strictly "Ogg Opus files" as described above,
1652*a58d3d2aSXin Li since they contain more than a single Opus stream per sequentially
1653*a58d3d2aSXin Li multiplexed segment, e.g. video or multiple audio tracks.
1654*a58d3d2aSXin LiIn such cases the '.opus' filename extension is NOT RECOMMENDED.
1655*a58d3d2aSXin Li</t>
1656*a58d3d2aSXin Li
1657*a58d3d2aSXin Li<t>
1658*a58d3d2aSXin LiIn either case, this document updates <xref target="RFC5334"/>
1659*a58d3d2aSXin Li to add 'opus' as a codecs parameter value with char[8]: 'OpusHead'
1660*a58d3d2aSXin Li as Codec Identifier.
1661*a58d3d2aSXin Li</t>
1662*a58d3d2aSXin Li</section>
1663*a58d3d2aSXin Li
1664*a58d3d2aSXin Li<section anchor="iana" title="IANA Considerations">
1665*a58d3d2aSXin Li<t>
1666*a58d3d2aSXin LiThis document updates the IANA Media Types registry to add .opus
1667*a58d3d2aSXin Li as a file extension for "audio/ogg", and to add itself as a reference
1668*a58d3d2aSXin Li alongside <xref target="RFC5334"/> for "audio/ogg", "video/ogg", and
1669*a58d3d2aSXin Li "application/ogg" Media Types.
1670*a58d3d2aSXin Li</t>
1671*a58d3d2aSXin Li<t>
1672*a58d3d2aSXin LiThis document defines a new registry "Opus Channel Mapping Families" to
1673*a58d3d2aSXin Li indicate how the semantic meanings of the channels in a multi-channel Opus
1674*a58d3d2aSXin Li stream are described.
1675*a58d3d2aSXin LiIANA is requested to create a new name space of "Opus Channel Mapping
1676*a58d3d2aSXin Li Families".
1677*a58d3d2aSXin LiThis will be a new registry on the IANA Matrix, and not a subregistry of an
1678*a58d3d2aSXin Li existing registry.
1679*a58d3d2aSXin LiModifications to this registry follow the "Specification Required" registration
1680*a58d3d2aSXin Li policy as defined in <xref target="RFC5226"/>.
1681*a58d3d2aSXin LiEach registry entry consists of a Channel Mapping Family Number, which is
1682*a58d3d2aSXin Li specified in decimal in the range 0 to 255, inclusive, and a Reference (or
1683*a58d3d2aSXin Li list of references).
1684*a58d3d2aSXin LiEach Reference must point to sufficient documentation to describe what
1685*a58d3d2aSXin Li information is coded in the Opus identification header for this channel
1686*a58d3d2aSXin Li mapping family, how a demuxer determines the Stream Count ('N') and Coupled
1687*a58d3d2aSXin Li Stream Count ('M') from this information, and how it determines the proper
1688*a58d3d2aSXin Li interpretation of each of the decoded channels.
1689*a58d3d2aSXin Li</t>
1690*a58d3d2aSXin Li<t>
1691*a58d3d2aSXin LiThis document defines three initial assignments for this registry.
1692*a58d3d2aSXin Li</t>
1693*a58d3d2aSXin Li<texttable>
1694*a58d3d2aSXin Li<ttcol>Value</ttcol><ttcol>Reference</ttcol>
1695*a58d3d2aSXin Li<c>0</c><c>[RFCXXXX] <xref target="channel_mapping_0"/></c>
1696*a58d3d2aSXin Li<c>1</c><c>[RFCXXXX] <xref target="channel_mapping_1"/></c>
1697*a58d3d2aSXin Li<c>255</c><c>[RFCXXXX] <xref target="channel_mapping_255"/></c>
1698*a58d3d2aSXin Li</texttable>
1699*a58d3d2aSXin Li<t>
1700*a58d3d2aSXin LiThe designated expert will determine if the Reference points to a specification
1701*a58d3d2aSXin Li that meets the requirements for permanence and ready availability laid out
1702*a58d3d2aSXin Li in&nbsp;<xref target="RFC5226"/> and that it specifies the information
1703*a58d3d2aSXin Li described above with sufficient clarity to allow interoperable
1704*a58d3d2aSXin Li implementations.
1705*a58d3d2aSXin Li</t>
1706*a58d3d2aSXin Li</section>
1707*a58d3d2aSXin Li
1708*a58d3d2aSXin Li<section anchor="Acknowledgments" title="Acknowledgments">
1709*a58d3d2aSXin Li<t>
1710*a58d3d2aSXin LiThanks to Ben Campbell, Joel M. Halpern, Mark Harris, Greg Maxwell,
1711*a58d3d2aSXin Li Christopher "Monty" Montgomery, Jean-Marc Valin, Stephan Wenger, and Mo Zanaty
1712*a58d3d2aSXin Li for their valuable contributions to this document.
1713*a58d3d2aSXin LiAdditional thanks to Andrew D'Addesio, Greg Maxwell, and Vincent Penquerc'h for
1714*a58d3d2aSXin Li their feedback based on early implementations.
1715*a58d3d2aSXin Li</t>
1716*a58d3d2aSXin Li</section>
1717*a58d3d2aSXin Li
1718*a58d3d2aSXin Li<section title="RFC Editor Notes">
1719*a58d3d2aSXin Li<t>
1720*a58d3d2aSXin LiIn&nbsp;<xref target="iana"/>, "RFCXXXX" is to be replaced with the RFC number
1721*a58d3d2aSXin Li assigned to this draft.
1722*a58d3d2aSXin Li</t>
1723*a58d3d2aSXin Li</section>
1724*a58d3d2aSXin Li
1725*a58d3d2aSXin Li</middle>
1726*a58d3d2aSXin Li<back>
1727*a58d3d2aSXin Li<references title="Normative References">
1728*a58d3d2aSXin Li &rfc2119;
1729*a58d3d2aSXin Li &rfc3533;
1730*a58d3d2aSXin Li &rfc3629;
1731*a58d3d2aSXin Li &rfc5226;
1732*a58d3d2aSXin Li &rfc5334;
1733*a58d3d2aSXin Li &rfc6381;
1734*a58d3d2aSXin Li &rfc6716;
1735*a58d3d2aSXin Li
1736*a58d3d2aSXin Li<reference anchor="EBU-R128" target="https://tech.ebu.ch/loudness">
1737*a58d3d2aSXin Li<front>
1738*a58d3d2aSXin Li  <title>Loudness Recommendation EBU R128</title>
1739*a58d3d2aSXin Li  <author>
1740*a58d3d2aSXin Li    <organization>EBU Technical Committee</organization>
1741*a58d3d2aSXin Li  </author>
1742*a58d3d2aSXin Li  <date month="August" year="2011"/>
1743*a58d3d2aSXin Li</front>
1744*a58d3d2aSXin Li</reference>
1745*a58d3d2aSXin Li
1746*a58d3d2aSXin Li<reference anchor="vorbis-comment"
1747*a58d3d2aSXin Li target="https://www.xiph.org/vorbis/doc/v-comment.html">
1748*a58d3d2aSXin Li<front>
1749*a58d3d2aSXin Li<title>Ogg Vorbis I Format Specification: Comment Field and Header
1750*a58d3d2aSXin Li Specification</title>
1751*a58d3d2aSXin Li<author initials="C." surname="Montgomery"
1752*a58d3d2aSXin Li fullname="Christopher &quot;Monty&quot; Montgomery"/>
1753*a58d3d2aSXin Li<date month="July" year="2002"/>
1754*a58d3d2aSXin Li</front>
1755*a58d3d2aSXin Li</reference>
1756*a58d3d2aSXin Li
1757*a58d3d2aSXin Li</references>
1758*a58d3d2aSXin Li
1759*a58d3d2aSXin Li<references title="Informative References">
1760*a58d3d2aSXin Li
1761*a58d3d2aSXin Li<!--?rfc include="http://xml.resource.org/public/rfc/bibxml/reference.RFC.3550.xml"?-->
1762*a58d3d2aSXin Li &rfc4732;
1763*a58d3d2aSXin Li &rfc6982;
1764*a58d3d2aSXin Li &rfc7587;
1765*a58d3d2aSXin Li
1766*a58d3d2aSXin Li<reference anchor="flac"
1767*a58d3d2aSXin Li target="https://xiph.org/flac/format.html">
1768*a58d3d2aSXin Li  <front>
1769*a58d3d2aSXin Li    <title>FLAC - Free Lossless Audio Codec Format Description</title>
1770*a58d3d2aSXin Li    <author initials="J." surname="Coalson" fullname="Josh Coalson"/>
1771*a58d3d2aSXin Li    <date month="January" year="2008"/>
1772*a58d3d2aSXin Li  </front>
1773*a58d3d2aSXin Li</reference>
1774*a58d3d2aSXin Li
1775*a58d3d2aSXin Li<reference anchor="hanning"
1776*a58d3d2aSXin Li target="https://en.wikipedia.org/w/index.php?title=Window_function&amp;oldid=703074467#Hann_.28Hanning.29_window">
1777*a58d3d2aSXin Li  <front>
1778*a58d3d2aSXin Li    <title>Hann window</title>
1779*a58d3d2aSXin Li    <author>
1780*a58d3d2aSXin Li      <organization>Wikipedia</organization>
1781*a58d3d2aSXin Li    </author>
1782*a58d3d2aSXin Li    <date month="February" year="2016"/>
1783*a58d3d2aSXin Li  </front>
1784*a58d3d2aSXin Li</reference>
1785*a58d3d2aSXin Li
1786*a58d3d2aSXin Li<reference anchor="linear-prediction"
1787*a58d3d2aSXin Li target="https://en.wikipedia.org/w/index.php?title=Linear_predictive_coding&amp;oldid=687498962">
1788*a58d3d2aSXin Li  <front>
1789*a58d3d2aSXin Li    <title>Linear Predictive Coding</title>
1790*a58d3d2aSXin Li    <author>
1791*a58d3d2aSXin Li      <organization>Wikipedia</organization>
1792*a58d3d2aSXin Li    </author>
1793*a58d3d2aSXin Li    <date month="October" year="2015"/>
1794*a58d3d2aSXin Li  </front>
1795*a58d3d2aSXin Li</reference>
1796*a58d3d2aSXin Li
1797*a58d3d2aSXin Li<reference anchor="lpc-sample"
1798*a58d3d2aSXin Li  target="https://svn.xiph.org/trunk/vorbis/lib/lpc.c">
1799*a58d3d2aSXin Li<front>
1800*a58d3d2aSXin Li  <title>Autocorrelation LPC coeff generation algorithm
1801*a58d3d2aSXin Li    (Vorbis source code)</title>
1802*a58d3d2aSXin Li<author initials="J." surname="Degener" fullname="Jutta Degener"/>
1803*a58d3d2aSXin Li<author initials="C." surname="Bormann" fullname="Carsten Bormann"/>
1804*a58d3d2aSXin Li<date month="November" year="1994"/>
1805*a58d3d2aSXin Li</front>
1806*a58d3d2aSXin Li</reference>
1807*a58d3d2aSXin Li
1808*a58d3d2aSXin Li<reference anchor="q-notation"
1809*a58d3d2aSXin Li target="https://en.wikipedia.org/w/index.php?title=Q_%28number_format%29&amp;oldid=697252615">
1810*a58d3d2aSXin Li<front>
1811*a58d3d2aSXin Li<title>Q (number format)</title>
1812*a58d3d2aSXin Li<author><organization>Wikipedia</organization></author>
1813*a58d3d2aSXin Li<date month="December" year="2015"/>
1814*a58d3d2aSXin Li</front>
1815*a58d3d2aSXin Li</reference>
1816*a58d3d2aSXin Li
1817*a58d3d2aSXin Li<reference anchor="replay-gain"
1818*a58d3d2aSXin Li target="https://wiki.xiph.org/VorbisComment#Replay_Gain">
1819*a58d3d2aSXin Li<front>
1820*a58d3d2aSXin Li<title>VorbisComment: Replay Gain</title>
1821*a58d3d2aSXin Li<author initials="C." surname="Parker" fullname="Conrad Parker"/>
1822*a58d3d2aSXin Li<author initials="M." surname="Leese" fullname="Martin Leese"/>
1823*a58d3d2aSXin Li<date month="June" year="2009"/>
1824*a58d3d2aSXin Li</front>
1825*a58d3d2aSXin Li</reference>
1826*a58d3d2aSXin Li
1827*a58d3d2aSXin Li<reference anchor="seeking"
1828*a58d3d2aSXin Li target="https://wiki.xiph.org/Seeking">
1829*a58d3d2aSXin Li<front>
1830*a58d3d2aSXin Li<title>Granulepos Encoding and How Seeking Really Works</title>
1831*a58d3d2aSXin Li<author initials="S." surname="Pfeiffer" fullname="Silvia Pfeiffer"/>
1832*a58d3d2aSXin Li<author initials="C." surname="Parker" fullname="Conrad Parker"/>
1833*a58d3d2aSXin Li<author initials="G." surname="Maxwell" fullname="Greg Maxwell"/>
1834*a58d3d2aSXin Li<date month="May" year="2012"/>
1835*a58d3d2aSXin Li</front>
1836*a58d3d2aSXin Li</reference>
1837*a58d3d2aSXin Li
1838*a58d3d2aSXin Li<reference anchor="vorbis-mapping"
1839*a58d3d2aSXin Li target="https://www.xiph.org/vorbis/doc/Vorbis_I_spec.html#x1-810004.3.9">
1840*a58d3d2aSXin Li<front>
1841*a58d3d2aSXin Li<title>The Vorbis I Specification, Section 4.3.9 Output Channel Order</title>
1842*a58d3d2aSXin Li<author initials="C." surname="Montgomery"
1843*a58d3d2aSXin Li fullname="Christopher &quot;Monty&quot; Montgomery"/>
1844*a58d3d2aSXin Li<date month="January" year="2010"/>
1845*a58d3d2aSXin Li</front>
1846*a58d3d2aSXin Li</reference>
1847*a58d3d2aSXin Li
1848*a58d3d2aSXin Li<reference anchor="vorbis-trim"
1849*a58d3d2aSXin Li target="https://xiph.org/vorbis/doc/Vorbis_I_spec.html#x1-132000A.2">
1850*a58d3d2aSXin Li  <front>
1851*a58d3d2aSXin Li    <title>The Vorbis I Specification, Appendix&nbsp;A: Embedding Vorbis
1852*a58d3d2aSXin Li      into an Ogg stream</title>
1853*a58d3d2aSXin Li    <author initials="C." surname="Montgomery"
1854*a58d3d2aSXin Li     fullname="Christopher &quot;Monty&quot; Montgomery"/>
1855*a58d3d2aSXin Li    <date month="November" year="2008"/>
1856*a58d3d2aSXin Li  </front>
1857*a58d3d2aSXin Li</reference>
1858*a58d3d2aSXin Li
1859*a58d3d2aSXin Li<reference anchor="wave-multichannel"
1860*a58d3d2aSXin Li target="http://msdn.microsoft.com/en-us/windows/hardware/gg463006.aspx">
1861*a58d3d2aSXin Li  <front>
1862*a58d3d2aSXin Li    <title>Multiple Channel Audio Data and WAVE Files</title>
1863*a58d3d2aSXin Li    <author>
1864*a58d3d2aSXin Li      <organization>Microsoft Corporation</organization>
1865*a58d3d2aSXin Li    </author>
1866*a58d3d2aSXin Li    <date month="March" year="2007"/>
1867*a58d3d2aSXin Li  </front>
1868*a58d3d2aSXin Li</reference>
1869*a58d3d2aSXin Li
1870*a58d3d2aSXin Li</references>
1871*a58d3d2aSXin Li
1872*a58d3d2aSXin Li</back>
1873*a58d3d2aSXin Li</rfc>
1874