Source: lib/cea/mp4_cea_parser.js

  1. /*! @license
  2. * Shaka Player
  3. * Copyright 2016 Google LLC
  4. * SPDX-License-Identifier: Apache-2.0
  5. */
  6. goog.provide('shaka.cea.Mp4CeaParser');
  7. goog.require('goog.asserts');
  8. goog.require('shaka.cea.CeaUtils');
  9. goog.require('shaka.cea.SeiProcessor');
  10. goog.require('shaka.log');
  11. goog.require('shaka.media.ClosedCaptionParser');
  12. goog.require('shaka.util.DataViewReader');
  13. goog.require('shaka.util.Error');
  14. goog.require('shaka.util.Mp4Parser');
  15. goog.require('shaka.util.Mp4BoxParsers');
  /**
   * MPEG4 stream parser used for extracting 708 closed captions data.
   *
   * Usage, as visible in this class: call init() with the init segment so
   * that timescales, TREX defaults and the video bitstream format are known,
   * then call parse() with each media segment to get the caption packets.
   *
   * @implements {shaka.extern.ICeaParser}
   * @export
   */
  shaka.cea.Mp4CeaParser = class {
    /** */
    constructor() {
      /**
       * SEI data processor.
       * @private
       * @const {!shaka.cea.SeiProcessor}
       */
      this.seiProcessor_ = new shaka.cea.SeiProcessor();

      /**
       * Map of track id to corresponding timescale.
       * @private {!Map<number, number>}
       */
      this.trackIdToTimescale_ = new Map();

      /**
       * Default sample duration, as specified by the TREX box.
       * @private {number}
       */
      this.defaultSampleDuration_ = 0;

      /**
       * Default sample size, as specified by the TREX box.
       * @private {number}
       */
      this.defaultSampleSize_ = 0;

      /**
       * Bitstream format detected from the sample entry of the init segment.
       * Stays UNKNOWN until init() finds a recognized codec box.
       * @private {shaka.cea.Mp4CeaParser.BitstreamFormat}
       */
      this.bitstreamFormat_ = shaka.cea.Mp4CeaParser.BitstreamFormat.UNKNOWN;
    }

    /**
     * Parses the init segment. Gets Default Sample Duration and Size from the
     * TREX box, and constructs a map of Track IDs to timescales. Each TRAK box
     * contains a track header (TKHD) containing track ID, and a media header
     * box (MDHD) containing the timescale for the track.
     *
     * Also records the bitstream format from the codec box found in the
     * sample description (or from FRMA for encrypted content). Throws a
     * CRITICAL / TEXT / INVALID_MP4_CEA shaka.util.Error when no track is
     * found or the number of track ids and timescales differ.
     *
     * @override
     */
    init(initSegment) {
      const Mp4Parser = shaka.util.Mp4Parser;
      const BitstreamFormat = shaka.cea.Mp4CeaParser.BitstreamFormat;

      // Collected in parse order; the n-th TKHD is paired with the n-th MDHD.
      const trackIds = [];
      const timescales = [];

      const codecBoxParser = (box) => this.setBitstreamFormat_(box.name);

      new Mp4Parser()
          .box('moov', Mp4Parser.children)
          .box('mvex', Mp4Parser.children)
          .fullBox('trex', (box) => {
            const parsedTREXBox = shaka.util.Mp4BoxParsers.parseTREX(
                box.reader);

            this.defaultSampleDuration_ = parsedTREXBox.defaultSampleDuration;
            this.defaultSampleSize_ = parsedTREXBox.defaultSampleSize;
          })
          .box('trak', Mp4Parser.children)
          .fullBox('tkhd', (box) => {
            goog.asserts.assert(
                box.version != null,
                'TKHD is a full box and should have a valid version.');
            const parsedTKHDBox = shaka.util.Mp4BoxParsers.parseTKHD(
                box.reader, box.version);
            trackIds.push(parsedTKHDBox.trackId);
          })
          .box('mdia', Mp4Parser.children)
          .fullBox('mdhd', (box) => {
            goog.asserts.assert(
                box.version != null,
                'MDHD is a full box and should have a valid version.');
            const parsedMDHDBox = shaka.util.Mp4BoxParsers.parseMDHD(
                box.reader, box.version);
            timescales.push(parsedMDHDBox.timescale);
          })
          .box('minf', Mp4Parser.children)
          .box('stbl', Mp4Parser.children)
          .fullBox('stsd', Mp4Parser.sampleDescription)

          // These are the various boxes that signal a codec.
          .box('avc1', codecBoxParser)
          .box('avc3', codecBoxParser)
          .box('hev1', codecBoxParser)
          .box('hvc1', codecBoxParser)
          .box('dvh1', codecBoxParser)
          .box('dvhe', codecBoxParser)

          // This signals an encrypted sample, which we can go inside of to
          // find the codec used.
          .box('encv', Mp4Parser.visualSampleEntry)
          .box('sinf', Mp4Parser.children)
          .box('frma', (box) => {
            const {codec} = shaka.util.Mp4BoxParsers.parseFRMA(box.reader);
            this.setBitstreamFormat_(codec);
          })
          .parse(initSegment, /* partialOkay= */ true);

      // At least one track should exist, and each track should have a
      // corresponding Id in TKHD box, and timescale in its MDHD box
      if (!trackIds.length || !timescales.length ||
          trackIds.length != timescales.length) {
        throw new shaka.util.Error(
            shaka.util.Error.Severity.CRITICAL,
            shaka.util.Error.Category.TEXT,
            shaka.util.Error.Code.INVALID_MP4_CEA);
      }

      // Not fatal here: parse() returns no packets while the format is
      // unknown.
      if (this.bitstreamFormat_ == BitstreamFormat.UNKNOWN) {
        shaka.log.alwaysWarn(
            'Unable to determine bitstream format for CEA parsing!');
      }

      // Populate the map from track Id to timescale
      trackIds.forEach((trackId, idx) => {
        this.trackIdToTimescale_.set(trackId, timescales[idx]);
      });
    }

    /**
     * Parses each video segment. In fragmented MP4s, MOOF and MDAT come in
     * pairs. The following logic gets the necessary info from MOOFs to parse
     * MDATs (base media decode time, sample sizes/offsets/durations, etc),
     * and then parses the MDAT boxes for CEA-708 packets using this
     * information.
     *
     * Returns the collected caption packets; the array is empty when the
     * bitstream format is unknown. Throws a CRITICAL / TEXT / INVALID_MP4_CEA
     * shaka.util.Error if an MDAT is reached before any TFDT was parsed.
     *
     * @override
     */
    parse(mediaSegment) {
      const Mp4Parser = shaka.util.Mp4Parser;
      const BitstreamFormat = shaka.cea.Mp4CeaParser.BitstreamFormat;

      if (this.bitstreamFormat_ == BitstreamFormat.UNKNOWN) {
        // We don't know how to extract SEI from this.
        return [];
      }

      /** @type {!Array<!shaka.extern.ICeaParser.CaptionPacket>} **/
      const captionPackets = [];

      // Fields that are found in MOOF boxes
      let defaultSampleDuration = this.defaultSampleDuration_;
      let defaultSampleSize = this.defaultSampleSize_;
      let moofOffset = 0;
      /** @type {!Array<shaka.util.ParsedTRUNBox>} */
      let parsedTRUNs = [];
      let baseMediaDecodeTime = null;
      let timescale = shaka.cea.CeaUtils.DEFAULT_TIMESCALE_VALUE;

      new Mp4Parser()
          .box('moof', (box) => {
            moofOffset = box.start;
            // trun box parsing is reset on each moof.
            parsedTRUNs = [];
            Mp4Parser.children(box);
          })
          .box('traf', Mp4Parser.children)
          .fullBox('trun', (box) => {
            goog.asserts.assert(
                box.version != null && box.flags != null,
                'TRUN is a full box and should have a valid version & flags.');

            const parsedTRUN = shaka.util.Mp4BoxParsers.parseTRUN(
                box.reader, box.version, box.flags);

            parsedTRUNs.push(parsedTRUN);
          })
          .fullBox('tfhd', (box) => {
            goog.asserts.assert(
                box.flags != null,
                'TFHD is a full box and should have valid flags.');

            const parsedTFHD = shaka.util.Mp4BoxParsers.parseTFHD(
                box.reader, box.flags);

            // If specified, defaultSampleDuration and defaultSampleSize
            // override the ones specified in the TREX box
            defaultSampleDuration = parsedTFHD.defaultSampleDuration ||
                this.defaultSampleDuration_;

            defaultSampleSize = parsedTFHD.defaultSampleSize ||
                this.defaultSampleSize_;

            const trackId = parsedTFHD.trackId;

            // Get the timescale from the track Id. Unknown track ids keep
            // whatever timescale is currently in effect.
            if (this.trackIdToTimescale_.has(trackId)) {
              timescale = this.trackIdToTimescale_.get(trackId);
            }
          })
          .fullBox('tfdt', (box) => {
            goog.asserts.assert(
                box.version != null,
                'TFDT is a full box and should have a valid version.');

            const parsedTFDT = shaka.util.Mp4BoxParsers.parseTFDTInaccurate(
                box.reader, box.version);

            baseMediaDecodeTime = parsedTFDT.baseMediaDecodeTime;
          })
          .box('mdat', (box) => {
            if (baseMediaDecodeTime === null) {
              // This field should have been populated by the Base Media
              // Decode Time in the tfdt box.
              shaka.log.alwaysWarn(
                  'Unable to find base media decode time for CEA captions!');
              throw new shaka.util.Error(
                  shaka.util.Error.Severity.CRITICAL,
                  shaka.util.Error.Category.TEXT,
                  shaka.util.Error.Code.INVALID_MP4_CEA);
            }

            // parseMdat_() adds the TRUN data offset to this value before
            // skipping, so this re-bases a MOOF-relative offset onto the
            // MDAT reader. NOTE(review): the 8 is presumably the MDAT box
            // header size (size + type) - confirm against Mp4Parser.
            const offset = moofOffset - box.start - 8;
            this.parseMdat_(box.reader, baseMediaDecodeTime, timescale,
                defaultSampleDuration, defaultSampleSize, offset, parsedTRUNs,
                captionPackets);
          })
          .parse(mediaSegment, /* partialOkay= */ false);

      return captionPackets;
    }

    /**
     * Parse MDAT box. Walks the length-prefixed NAL units in the box, hands
     * every SEI NAL unit to the SEI processor, and appends each resulting
     * packet (with its presentation time in seconds) to captionPackets.
     *
     * @param {!shaka.util.DataViewReader} reader Reader over the MDAT box.
     * @param {number} time Decode time of the first sample, in timescale
     *   units. Advanced by each sample's duration as samples are consumed.
     * @param {number} timescale Number of time units per second.
     * @param {number} defaultSampleDuration Used when a sample has no
     *   duration of its own.
     * @param {number} defaultSampleSize Used when a sample has no size of
     *   its own.
     * @param {number} offset Value added to the first TRUN's data offset to
     *   find how far to skip into the reader.
     * @param {!Array<shaka.util.ParsedTRUNBox>} parsedTRUNs TRUN boxes of the
     *   current MOOF.
     * @param {!Array<!shaka.extern.ICeaParser.CaptionPacket>} captionPackets
     *   Output array; packets are pushed onto it.
     * @private
     */
    parseMdat_(reader, time, timescale, defaultSampleDuration,
        defaultSampleSize, offset, parsedTRUNs, captionPackets) {
      const BitstreamFormat = shaka.cea.Mp4CeaParser.BitstreamFormat;
      const CeaUtils = shaka.cea.CeaUtils;

      // Without a TRUN box there is no data offset telling us where the
      // samples start, so nothing can be extracted. Bail out instead of
      // failing with a TypeError on parsedTRUNs[0] below.
      if (!parsedTRUNs.length) {
        return;
      }

      let sampleIndex = 0;

      // The fields in each ParsedTRUNSample contained in the sampleData
      // array are nullable. In the case of sample data and sample duration,
      // we use the defaults provided by the TREX/TFHD boxes. For sample
      // composition time offset, we default to 0.
      let sampleSize = defaultSampleSize;

      // Combine all sample data. This assumes that the samples described
      // across multiple trun boxes are still continuous in the mdat box.
      const sampleDatas = parsedTRUNs.map((t) => t.sampleData);
      const sampleData = [].concat(...sampleDatas);

      if (sampleData.length) {
        sampleSize = sampleData[0].sampleSize || defaultSampleSize;
      }

      reader.skip(offset + parsedTRUNs[0].dataOffset);

      while (reader.hasMoreData()) {
        // Each NAL unit is read as a 32-bit size followed by its header.
        const naluSize = reader.readUint32();
        const naluHeader = reader.readUint8();
        let naluType = null;
        let isSeiMessage = false;
        let naluHeaderSize = 1;

        goog.asserts.assert(this.bitstreamFormat_ != BitstreamFormat.UNKNOWN,
            'Bitstream format should have been checked before now!');
        switch (this.bitstreamFormat_) {
          case BitstreamFormat.H264:
            // The type is the low 5 bits of the one-byte header.
            naluType = naluHeader & 0x1f;
            isSeiMessage = naluType == CeaUtils.H264_NALU_TYPE_SEI;
            break;

          case BitstreamFormat.H265:
            // Two-byte header: the type is taken from the first byte, the
            // second byte is skipped.
            naluHeaderSize = 2;
            reader.skip(1);
            naluType = (naluHeader >> 1) & 0x3f;
            isSeiMessage =
                naluType == CeaUtils.H265_PREFIX_NALU_TYPE_SEI ||
                naluType == CeaUtils.H265_SUFFIX_NALU_TYPE_SEI;
            break;

          default:
            return;
        }

        if (isSeiMessage) {
          let timeOffset = 0;

          if (sampleIndex < sampleData.length) {
            timeOffset =
                sampleData[sampleIndex].sampleCompositionTimeOffset || 0;
          }

          // Presentation time in seconds.
          const pts = (time + timeOffset) / timescale;
          for (const packet of this.seiProcessor_
              .process(reader.readBytes(naluSize - naluHeaderSize))) {
            captionPackets.push({
              packet,
              pts,
            });
          }
        } else {
          try {
            reader.skip(naluSize - naluHeaderSize);
          } catch (e) {
            // It is necessary to ignore this error because it can break the
            // start of playback even if the user does not want to see the
            // subtitles.
            break;
          }
        }

        // Account for the NAL unit payload plus its 4-byte size field.
        sampleSize -= (naluSize + 4);
        if (sampleSize == 0) {
          // The current sample is fully consumed: advance the clock by its
          // duration and move on to the next sample's size.
          if (sampleIndex < sampleData.length) {
            time += sampleData[sampleIndex].sampleDuration ||
                defaultSampleDuration;
          } else {
            time += defaultSampleDuration;
          }

          sampleIndex++;

          if (sampleIndex < sampleData.length) {
            sampleSize =
                sampleData[sampleIndex].sampleSize || defaultSampleSize;
          } else {
            sampleSize = defaultSampleSize;
          }
        }
      }
    }

    /**
     * Records the bitstream format for a codec, if the codec is a known key
     * of CodecBitstreamMap_. Unknown codecs leave the current format as is.
     *
     * @param {string} codec A fourcc for a codec.
     * @private
     */
    setBitstreamFormat_(codec) {
      const codecMap = shaka.cea.Mp4CeaParser.CodecBitstreamMap_;
      // Own-property check: the "in" operator would also accept keys
      // inherited from Object.prototype.
      if (Object.prototype.hasOwnProperty.call(codecMap, codec)) {
        this.bitstreamFormat_ = codecMap[codec];
      }
    }
  };
  /**
   * Video bitstream formats the parser distinguishes between when reading
   * NAL unit headers. UNKNOWN is the initial value before a codec is seen.
   *
   * @enum {number}
   */
  shaka.cea.Mp4CeaParser.BitstreamFormat = {
    UNKNOWN: 0,
    H264: 1,
    H265: 2,
  };
  /**
   * Maps a codec fourcc to the bitstream format used to parse its samples.
   *
   * @private {Object.<string, shaka.cea.Mp4CeaParser.BitstreamFormat>}
   */
  shaka.cea.Mp4CeaParser.CodecBitstreamMap_ = {
    // H.264 sample entries.
    'avc1': shaka.cea.Mp4CeaParser.BitstreamFormat.H264,
    'avc3': shaka.cea.Mp4CeaParser.BitstreamFormat.H264,

    // H.265 sample entries.
    'hev1': shaka.cea.Mp4CeaParser.BitstreamFormat.H265,
    'hvc1': shaka.cea.Mp4CeaParser.BitstreamFormat.H265,

    // Dolby Vision is also H265.
    'dvh1': shaka.cea.Mp4CeaParser.BitstreamFormat.H265,
    'dvhe': shaka.cea.Mp4CeaParser.BitstreamFormat.H265,
  };
  // Register a factory so that a new Mp4CeaParser is created for content
  // with the 'video/mp4' MIME type.
  shaka.media.ClosedCaptionParser.registerParser(
      'video/mp4', () => new shaka.cea.Mp4CeaParser());