feat: add position data to captions (#434)

videojs · Jul 21, 2023 · 30f2132 · 30f2132
1 parent dc56f1b
commit 30f2132
Show file tree

Hide file tree

Showing 6 changed files with 431 additions and 140 deletions.
diff --git a/lib/flv/coalesce-stream.js b/lib/flv/coalesce-stream.js
@@ -35,7 +35,7 @@ var CoalesceStream = function(options) {
   this.push = function(output) {
     // buffer incoming captions until the associated video segment
     // finishes
-    if (output.text) {
+    if (output.content || output.text) {
       return this.pendingCaptions.push(output);
     }
     // buffer incoming id3 tags until the final flush

diff --git a/lib/m2ts/caption-stream.js b/lib/m2ts/caption-stream.js
@@ -1231,10 +1231,12 @@ var ROWS = [0x1100, 0x1120, 0x1200, 0x1220, 0x1500, 0x1520, 0x1600, 0x1620,
 
 // CEA-608 captions are rendered onto a 34x15 matrix of character
 // cells. The "bottom" row is the last element in the outer array.
+// We keep track of positioning information as we go by storing the
+// number of indentations and the tab offset in this buffer.
 var createDisplayBuffer = function() {
   var result = [], i = BOTTOM_ROW + 1;
   while (i--) {
-    result.push('');
+    result.push({ text: '', indent: 0, offset: 0 });
   }
   return result;
 };
@@ -1312,9 +1314,9 @@ var Cea608Stream = function(field, dataChannel) {
 
     } else if (data === this.BACKSPACE_) {
       if (this.mode_ === 'popOn') {
-        this.nonDisplayed_[this.row_] = this.nonDisplayed_[this.row_].slice(0, -1);
+        this.nonDisplayed_[this.row_].text = this.nonDisplayed_[this.row_].text.slice(0, -1);
       } else {
-        this.displayed_[this.row_] = this.displayed_[this.row_].slice(0, -1);
+        this.displayed_[this.row_].text = this.displayed_[this.row_].text.slice(0, -1);
       }
     } else if (data === this.ERASE_DISPLAYED_MEMORY_) {
       this.flushDisplayed(packet.pts);
@@ -1352,9 +1354,9 @@ var Cea608Stream = function(field, dataChannel) {
 
       // Delete the previous character
       if (this.mode_ === 'popOn') {
-        this.nonDisplayed_[this.row_] = this.nonDisplayed_[this.row_].slice(0, -1);
+        this.nonDisplayed_[this.row_].text = this.nonDisplayed_[this.row_].text.slice(0, -1);
       } else {
-        this.displayed_[this.row_] = this.displayed_[this.row_].slice(0, -1);
+        this.displayed_[this.row_].text = this.displayed_[this.row_].text.slice(0, -1);
       }
 
       // Bitmask char0 so that we can apply character transformations
@@ -1390,7 +1392,13 @@ var Cea608Stream = function(field, dataChannel) {
       // increments, with an additional offset code of 1-3 to reach any
       // of the 32 columns specified by CEA-608. So all we need to do
       // here is increment the column cursor by the given offset.
-      this.column_ += (char1 & 0x03);
+      const offset = (char1 & 0x03);
+
+      // For an offset value 1-3, set the offset for that caption
+      // in the non-displayed array.
+      this.nonDisplayed_[this.row_].offset = offset;
+
+      this.column_ += offset;
 
     // Detect PACs (Preamble Address Codes)
     } else if (this.isPAC(char0, char1)) {
@@ -1427,7 +1435,11 @@ var Cea608Stream = function(field, dataChannel) {
         // increments the column cursor by 4, so we can get the desired
         // column position by bit-shifting to the right (to get n/2)
         // and multiplying by 4.
-        this.column_ = ((data & 0xe) >> 1) * 4;
+        const indentations = ((data & 0xe) >> 1);
+
+        this.column_ = indentations * 4;
+        // add to the number of indentations for positioning
+        this.nonDisplayed_[this.row_].indent += indentations;
       }
 
       if (this.isColorPAC(char1)) {
@@ -1458,32 +1470,51 @@ Cea608Stream.prototype = new Stream();
 // Trigger a cue point that captures the current state of the
 // display buffer
 Cea608Stream.prototype.flushDisplayed = function(pts) {
-  var content = this.displayed_
-    // remove spaces from the start and end of the string
-    .map(function(row, index) {
+  const logWarning = (index) => {
+    this.trigger('log', {
+      level: 'warn',
+      message: 'Skipping a malformed 608 caption at index ' + index + '.'
+    });
+  };
+  const content = [];
+
+  this.displayed_.forEach((row, i) => {
+    if (row && row.text && row.text.length) {
+
       try {
-        return row.trim();
+        // remove spaces from the start and end of the string
+        row.text = row.text.trim();
       } catch (e) {
         // Ordinarily, this shouldn't happen. However, caption
         // parsing errors should not throw exceptions and
         // break playback.
-        this.trigger('log', {
-          level: 'warn',
-          message: 'Skipping a malformed 608 caption at index ' + index + '.'
+        logWarning(i);
+      }
+      // See the below link for more details on the following fields:
+      // https://dvcs.w3.org/hg/text-tracks/raw-file/default/608toVTT/608toVTT.html#positioning-in-cea-608
+      if (row.text.length) {
+        content.push({
+          // The text to be displayed in the caption from this specific row, with whitespace removed.
+          text: row.text,
+          // Value between 1 and 15 representing the PAC row used to calculate line height.
+          line: i + 1,
+          // A number representing the indent position by percentage (CEA-608 PAC indent code).
+          // The value will be a number between 10 and 80. Offset is used to add an additional
+          // value to the position if necessary.
+          position: 10 + Math.min(70, row.indent * 10) + (row.offset * 2.5),
         });
-        return '';
       }
-    }, this)
-    // combine all text rows to display in one cue
-    .join('\n')
-    // and remove blank rows from the start and end, but not the middle
-    .replace(/^\n+|\n+$/g, '');
+    }
+    else if (row === undefined || row === null) {
+      logWarning(i);
+    }
+  });
 
   if (content.length) {
     this.trigger('data', {
       startPts: this.startPts_,
       endPts: pts,
-      text: content,
+      content,
       stream: this.name_
     });
   }
@@ -1686,7 +1717,7 @@ Cea608Stream.prototype.setRollUp = function(pts, newBaseRow) {
     // move currently displayed captions (up or down) to the new base row
     for (var i = 0; i < this.rollUpRows_; i++) {
       this.displayed_[newBaseRow - i] = this.displayed_[this.row_ - i];
-      this.displayed_[this.row_ - i] = '';
+      this.displayed_[this.row_ - i] = { text: '', indent: 0, offset: 0 };
     }
   }
 
@@ -1722,43 +1753,43 @@ Cea608Stream.prototype.clearFormatting = function(pts) {
 
 // Mode Implementations
 Cea608Stream.prototype.popOn = function(pts, text) {
-  var baseRow = this.nonDisplayed_[this.row_];
+  var baseRow = this.nonDisplayed_[this.row_].text;
 
   // buffer characters
   baseRow += text;
-  this.nonDisplayed_[this.row_] = baseRow;
+  this.nonDisplayed_[this.row_].text = baseRow;
 };
 
 Cea608Stream.prototype.rollUp = function(pts, text) {
-  var baseRow = this.displayed_[this.row_];
+  var baseRow = this.displayed_[this.row_].text;
 
   baseRow += text;
-  this.displayed_[this.row_] = baseRow;
+  this.displayed_[this.row_].text = baseRow;
 
 };
 
 Cea608Stream.prototype.shiftRowsUp_ = function() {
   var i;
   // clear out inactive rows
   for (i = 0; i < this.topRow_; i++) {
-    this.displayed_[i] = '';
+    this.displayed_[i] = { text: '', indent: 0, offset: 0 };
   }
   for (i = this.row_ + 1; i < BOTTOM_ROW + 1; i++) {
-    this.displayed_[i] = '';
+    this.displayed_[i] = { text: '', indent: 0, offset: 0 };
   }
   // shift displayed rows up
   for (i = this.topRow_; i < this.row_; i++) {
     this.displayed_[i] = this.displayed_[i + 1];
   }
   // clear out the bottom row
-  this.displayed_[this.row_] = '';
+  this.displayed_[this.row_] = { text: '', indent: 0, offset: 0 };
 };
 
 Cea608Stream.prototype.paintOn = function(pts, text) {
-  var baseRow = this.displayed_[this.row_];
+  var baseRow = this.displayed_[this.row_].text;
 
   baseRow += text;
-  this.displayed_[this.row_] = baseRow;
+  this.displayed_[this.row_].text = baseRow;
 };
 
 // exports

diff --git a/lib/mp4/caption-parser.js b/lib/mp4/caption-parser.js
@@ -245,7 +245,10 @@ var parseCaptionNals = function(segment, videoTrackId) {
   * @return {?Object[]} parsedCaptions - A list of captions or null if no video tracks
   * @return {Number} parsedCaptions[].startTime - The time to show the caption in seconds
   * @return {Number} parsedCaptions[].endTime - The time to stop showing the caption in seconds
-  * @return {String} parsedCaptions[].text - The visible content of the caption
+  * @return {Object[]} parsedCaptions[].content - A list of individual caption segments
+  * @return {String} parsedCaptions[].content.text - The visible content of the caption segment
+  * @return {Number} parsedCaptions[].content.line - The line height from 1-15 for positioning of the caption segment
+  * @return {Number} parsedCaptions[].content.position - The column indent percentage for cue positioning from 10-80
  **/
 var parseEmbeddedCaptions = function(segment, trackId, timescale) {
   var captionNals;

diff --git a/lib/mp4/transmuxer.js b/lib/mp4/transmuxer.js
@@ -727,7 +727,7 @@ CoalesceStream = function(options, metadataStream) {
   this.push = function(output) {
     // buffer incoming captions until the associated video segment
     // finishes
-    if (output.text) {
+    if (output.content || output.text) {
       return this.pendingCaptions.push(output);
     }
     // buffer incoming id3 tags until the final flush

diff --git a/test/caption-parser.test.js b/test/caption-parser.test.js
@@ -49,7 +49,7 @@ QUnit.test('parse captions from real segment', function(assert) {
   cc = captionParser.parse(dashSegment, trackIds, timescales);
 
   assert.equal(cc.captions.length, 1);
-  assert.equal(cc.captions[0].text, '00:00:00',
+  assert.equal(cc.captions[0].content[0].text, '00:00:00',
     'real segment caption has correct text');
   assert.equal(cc.captions[0].stream, 'CC1',
     'real segment caption has correct stream');
@@ -86,7 +86,7 @@ QUnit.test('parseTrackId for version 0 and version 1 boxes', function(assert) {
     { 1: 90000 }); // timescales);
 
   assert.equal(v0Captions.captions.length, 1, 'got 1 version0 caption');
-  assert.equal(v0Captions.captions[0].text, 'test string #1',
+  assert.equal(v0Captions.captions[0].content[0].text, 'test string #1',
     'got the expected version0 caption text');
   assert.equal(v0Captions.captions[0].stream, 'CC1',
     'returned the correct caption stream CC1');
@@ -108,7 +108,7 @@ QUnit.test('parseTrackId for version 0 and version 1 boxes', function(assert) {
     { 2: 90000 }); // timescales
 
   assert.equal(v1Captions.captions.length, 1, 'got version1 caption');
-  assert.equal(v1Captions.captions[0].text, 'test string #2',
+  assert.equal(v1Captions.captions[0].content[0].text, 'test string #2',
     'got the expected version1 caption text');
   assert.equal(v1Captions.captions[0].stream, 'CC4',
     'returned the correct caption stream CC4');