Shaka Packager SDK
webvtt_parser.cc
1 // Copyright 2017 Google LLC. All rights reserved.
2 //
3 // Use of this source code is governed by a BSD-style
4 // license that can be found in the LICENSE file or at
5 // https://developers.google.com/open-source/licenses/bsd
6 
7 #include <packager/media/formats/webvtt/webvtt_parser.h>
8 
9 #include <absl/log/check.h>
10 #include <absl/log/log.h>
11 #include <absl/strings/numbers.h>
12 #include <absl/strings/str_format.h>
13 #include <absl/strings/str_split.h>
14 
15 #include <packager/kv_pairs/kv_pairs.h>
16 #include <packager/media/base/text_stream_info.h>
17 #include <packager/media/formats/webvtt/webvtt_utils.h>
18 #include <packager/utils/string_trim_split.h>
19 
20 namespace shaka {
21 namespace media {
22 namespace {
23 
// Index of the only stream this parser produces. It is passed as the track id
// to the text-sample callback, whose track id parameter is a uint32_t, so the
// constant uses the same width to avoid an implicit narrowing conversion.
constexpr uint32_t kStreamIndex = 0;
25 
// Renders the |size| lines starting at |block| as a single multi-line string
// framed by start/end markers. Used to embed a block in log messages.
std::string BlockToString(const std::string* block, size_t size) {
  std::string rendered(" --- BLOCK START ---\n");

  const std::string* const last = block + size;
  for (const std::string* line = block; line != last; ++line) {
    rendered += " ";
    rendered += *line;
    rendered += "\n";
  }

  rendered += " --- BLOCK END ---";
  return rendered;
}
39 
40 // Comments are just blocks that are preceded by a blank line, start with the
41 // word "NOTE" (followed by a space or newline), and end at the first blank
42 // line.
43 // SOURCE: https://www.w3.org/TR/webvtt1
44 bool IsLikelyNote(const std::string& line) {
45  return line == "NOTE" || absl::StartsWith(line, "NOTE ") ||
46  absl::StartsWith(line, "NOTE\t");
47 }
48 
// The arrow "-->" may only appear on the timing line of a cue, so any line
// containing it is treated as a cue timing line.
bool IsLikelyCueTiming(const std::string& line) {
  const std::string::size_type arrow_pos = line.find("-->");
  const bool has_arrow = arrow_pos != std::string::npos;
  return has_arrow;
}
55 
// A WebVTT cue identifier is any sequence of one or more characters that does
// not contain the substring "-->" and contains no LF or CR characters. Only
// the "-->" condition is checked here.
// SOURCE: https://www.w3.org/TR/webvtt1/#webvtt-cue-identifier
bool MaybeCueId(const std::string& line) {
  static const char kArrow[] = "-->";
  if (line.find(kArrow) != std::string::npos) {
    return false;
  }
  return true;
}
64 
65 // Check to see if the block is likely a style block. Style blocks are
66 // identified as any block that starts with a line that only contains
67 // "STYLE".
68 // SOURCE: https://w3c.github.io/webvtt/#styling
69 bool IsLikelyStyle(const std::string& line) {
70  return absl::StripTrailingAsciiWhitespace(line) == "STYLE";
71 }
72 
73 // Check to see if the block is likely a region block. Region blocks are
74 // identified as any block that starts with a line that only contains
75 // "REGION".
76 // SOURCE: https://w3c.github.io/webvtt/#webvtt-region
77 bool IsLikelyRegion(const std::string& line) {
78  return absl::StripTrailingAsciiWhitespace(line) == "REGION";
79 }
80 
81 bool ParsePercent(const std::string& str, float* value) {
82  // https://www.w3.org/TR/webvtt1/#webvtt-percentage
83  // E.g. "4%" or "1.5%"
84  if (str[str.size() - 1] != '%') {
85  return false;
86  }
87 
88  double temp;
89  if (!absl::SimpleAtod(str.substr(0, str.size() - 1), &temp) || temp > 100) {
90  return false;
91  }
92  *value = temp;
93  return true;
94 }
95 
96 bool ParseDoublePercent(const std::string& str, float* a, float* b) {
97  std::vector<std::string> percents = SplitAndTrimSkipEmpty(str, ',');
98 
99  if (percents.size() != 2) {
100  return false;
101  }
102  float temp_a, temp_b;
103  if (!ParsePercent(percents[0], &temp_a) ||
104  !ParsePercent(percents[1], &temp_b)) {
105  return false;
106  }
107  *a = temp_a;
108  *b = temp_b;
109  return true;
110 }
111 
112 void ParseSettings(const std::string& id,
113  const std::string& value,
114  TextSettings* settings) {
115  // https://www.w3.org/TR/webvtt1/#ref-for-parse-the-webvtt-cue-settings-1
116  if (id == "region") {
117  settings->region = value;
118  } else if (id == "vertical") {
119  if (value == "rl") {
120  settings->writing_direction = WritingDirection::kVerticalGrowingLeft;
121  } else if (value == "lr") {
122  settings->writing_direction = WritingDirection::kVerticalGrowingRight;
123  } else {
124  LOG(WARNING) << "Invalid WebVTT vertical setting: " << value;
125  }
126  } else if (id == "line") {
127  const auto pos = value.find(',');
128  const std::string line = value.substr(0, pos);
129  const std::string align =
130  pos != std::string::npos ? value.substr(pos + 1) : "";
131  if (pos != std::string::npos) {
132  LOG(WARNING) << "WebVTT line alignment isn't supported";
133  }
134 
135  if (!line.empty() && line[line.size() - 1] == '%') {
136  float temp;
137  if (!ParsePercent(line, &temp)) {
138  LOG(WARNING) << "Invalid WebVTT line: " << value;
139  return;
140  }
141  settings->line.emplace(temp, TextUnitType::kPercent);
142  } else {
143  double temp;
144  if (!absl::SimpleAtod(line, &temp)) {
145  LOG(WARNING) << "Invalid WebVTT line: " << value;
146  return;
147  }
148  settings->line.emplace(temp, TextUnitType::kLines);
149  }
150  } else if (id == "position") {
151  const auto pos = value.find(',');
152  const std::string position = value.substr(0, pos);
153  const std::string align =
154  pos != std::string::npos ? value.substr(pos + 1) : "";
155  if (pos != std::string::npos) {
156  LOG(WARNING) << "WebVTT position alignment isn't supported";
157  }
158 
159  float temp;
160  if (ParsePercent(position, &temp)) {
161  settings->position.emplace(temp, TextUnitType::kPercent);
162  } else {
163  LOG(WARNING) << "Invalid WebVTT position: " << value;
164  }
165  } else if (id == "size") {
166  float temp;
167  if (ParsePercent(value, &temp)) {
168  settings->width.emplace(temp, TextUnitType::kPercent);
169  } else {
170  LOG(WARNING) << "Invalid WebVTT size: " << value;
171  }
172  } else if (id == "align") {
173  if (value == "start") {
174  settings->text_alignment = TextAlignment::kStart;
175  } else if (value == "center" || value == "middle") {
176  settings->text_alignment = TextAlignment::kCenter;
177  } else if (value == "end") {
178  settings->text_alignment = TextAlignment::kEnd;
179  } else if (value == "left") {
180  settings->text_alignment = TextAlignment::kLeft;
181  } else if (value == "right") {
182  settings->text_alignment = TextAlignment::kRight;
183  } else {
184  LOG(WARNING) << "Invalid WebVTT align: " << value;
185  }
186  } else {
187  LOG(WARNING) << "Unknown WebVTT setting: " << id;
188  }
189 }
190 
191 } // namespace
192 
193 WebVttParser::WebVttParser() {}
194 
195 void WebVttParser::Init(const InitCB& init_cb,
196  const NewMediaSampleCB& new_media_sample_cb,
197  const NewTextSampleCB& new_text_sample_cb,
198  KeySource* decryption_key_source) {
199  DCHECK(init_cb_ == nullptr);
200  DCHECK(init_cb != nullptr);
201  DCHECK(new_text_sample_cb != nullptr);
202  DCHECK(!decryption_key_source) << "Encrypted WebVTT not supported";
203 
204  init_cb_ = init_cb;
205  new_text_sample_cb_ = new_text_sample_cb;
206 }
207 
209  reader_.Flush();
210  return Parse();
211 }
212 
213 bool WebVttParser::Parse(const uint8_t* buf, int size) {
214  reader_.PushData(buf, size);
215  return Parse();
216 }
217 
218 bool WebVttParser::Parse() {
219  if (!initialized_) {
220  std::vector<std::string> block;
221  if (!reader_.Next(&block)) {
222  return true;
223  }
224 
225  // Check the header. It is possible for a 0xFEFF BOM to come before the
226  // header text.
227  if (block.size() != 1) {
228  LOG(WARNING) << "Failed to read WEBVTT header - "
229  << "block size should be 1 but was " << block.size() << ".";
230  }
231  if (block[0] != "WEBVTT" && block[0] != "\xEF\xBB\xBFWEBVTT") {
232  LOG(WARNING) << "Failed to read WEBVTT header - should be WEBVTT but was "
233  << block[0];
234  }
235  initialized_ = true;
236  }
237 
238  std::vector<std::string> block;
239  while (reader_.Next(&block)) {
240  if (!ParseBlock(block))
241  return false;
242  }
243  return true;
244 }
245 
246 bool WebVttParser::ParseBlock(const std::vector<std::string>& block) {
247  // NOTE
248  if (IsLikelyNote(block[0])) {
249  // We can safely ignore the whole block.
250  return true;
251  }
252 
253  // STYLE
254  if (IsLikelyStyle(block[0])) {
255  if (saw_cue_) {
256  LOG(WARNING)
257  << "Found style block after seeing cue. Ignoring style block";
258  } else {
259  for (size_t i = 1; i < block.size(); i++) {
260  if (!css_styles_.empty())
261  css_styles_ += "\n";
262  css_styles_ += block[i];
263  }
264  }
265  return true;
266  }
267 
268  // REGION
269  if (IsLikelyRegion(block[0])) {
270  if (saw_cue_) {
271  LOG(WARNING)
272  << "Found region block after seeing cue. Ignoring region block";
273  return true;
274  } else {
275  return ParseRegion(block);
276  }
277  }
278 
279  // CUE with ID
280  if (block.size() >= 2 && MaybeCueId(block[0]) &&
281  IsLikelyCueTiming(block[1]) && ParseCueWithId(block)) {
282  saw_cue_ = true;
283  return true;
284  }
285 
286  // CUE with no ID
287  if (IsLikelyCueTiming(block[0]) && ParseCueWithNoId(block)) {
288  saw_cue_ = true;
289  return true;
290  }
291 
292  LOG(ERROR) << "Failed to determine block classification:\n"
293  << BlockToString(block.data(), block.size());
294  return false;
295 }
296 
297 bool WebVttParser::ParseRegion(const std::vector<std::string>& block) {
298  TextRegion region;
299  std::string region_id;
300  // Fill in defaults. Some may already be this, but set them anyway.
301  // See https://www.w3.org/TR/webvtt1/#regions
302  region.width.value = 100;
303  region.width.type = TextUnitType::kPercent;
304  region.height.value = 3;
305  region.height.type = TextUnitType::kLines;
306  region.window_anchor_x.value = 0;
307  region.window_anchor_x.type = TextUnitType::kPercent;
308  region.window_anchor_y.value = 100;
309  region.window_anchor_y.type = TextUnitType::kPercent;
310  region.region_anchor_x.value = 0;
311  region.region_anchor_x.type = TextUnitType::kPercent;
312  region.region_anchor_y.value = 100;
313  region.region_anchor_y.type = TextUnitType::kPercent;
314 
315  bool first = true;
316  for (const auto& line : block) {
317  // First line is "REGION", skip.
318  if (first) {
319  first = false;
320  continue;
321  }
322 
323  std::vector<KVPair> kv_pairs = SplitStringIntoKeyValuePairs(line, ':', ' ');
324 
325  for (const auto& pair : kv_pairs) {
326  const std::string& value = pair.second;
327  if (pair.first == "id") {
328  if (value.find("-->") != std::string::npos) {
329  LOG(ERROR) << "Invalid WebVTT REGION ID: " << value;
330  return false;
331  }
332  if (regions_.find(value) != regions_.end()) {
333  LOG(ERROR) << "Duplicate WebVTT REGION: " << value;
334  return false;
335  }
336  region_id = value;
337  } else if (pair.first == "width") {
338  if (!ParsePercent(value, &region.width.value)) {
339  LOG(ERROR) << "Invalid WebVTT REGION width: " << value;
340  return false;
341  }
342  } else if (pair.first == "lines") {
343  unsigned int temp;
344  if (!absl::SimpleAtoi(value, &temp)) {
345  LOG(ERROR) << "Invalid WebVTT REGION lines: " << value;
346  return false;
347  }
348  region.height.value = temp;
349  } else if (pair.first == "regionanchor") {
350  if (!ParseDoublePercent(value, &region.region_anchor_x.value,
351  &region.region_anchor_y.value)) {
352  LOG(ERROR) << "Invalid WebVTT REGION regionanchor: " << value;
353  return false;
354  }
355  } else if (pair.first == "viewportanchor") {
356  if (!ParseDoublePercent(value, &region.window_anchor_x.value,
357  &region.window_anchor_y.value)) {
358  LOG(ERROR) << "Invalid WebVTT REGION windowanchor: " << value;
359  return false;
360  }
361  } else if (pair.first == "scroll") {
362  if (value != "up") {
363  LOG(ERROR) << "Invalid WebVTT REGION scroll: " << value;
364  return false;
365  }
366  region.scroll = true;
367  } else {
368  LOG(ERROR) << "Unknown WebVTT REGION setting: " << pair.first;
369  return false;
370  }
371  }
372  }
373  if (region_id.empty()) {
374  LOG(ERROR) << "WebVTT REGION id is required";
375  return false;
376  }
377  regions_.insert(std::make_pair(region_id, std::move(region)));
378  return true;
379 }
380 
381 bool WebVttParser::ParseCueWithNoId(const std::vector<std::string>& block) {
382  return ParseCue("", block.data(), block.size());
383 }
384 
385 bool WebVttParser::ParseCueWithId(const std::vector<std::string>& block) {
386  return ParseCue(block[0], block.data() + 1, block.size() - 1);
387 }
388 
389 bool WebVttParser::ParseCue(const std::string& id,
390  const std::string* block,
391  size_t block_size) {
392  std::vector<std::string> time_and_style =
393  SplitAndTrimSkipEmpty(block[0], ' ');
394 
395  int64_t start_time = 0;
396  int64_t end_time = 0;
397 
398  const bool parsed_time =
399  time_and_style.size() >= 3 && time_and_style[1] == "-->" &&
400  WebVttTimestampToMs(time_and_style[0], &start_time) &&
401  WebVttTimestampToMs(time_and_style[2], &end_time);
402 
403  if (!parsed_time) {
404  LOG(ERROR) << "Could not parse start time, -->, and end time from "
405  << block[0];
406  return false;
407  }
408 
409  if (!stream_info_dispatched_)
410  DispatchTextStreamInfo();
411 
412  // According to the WebVTT spec end time must be greater than the start time
413  // of the cue. Since we are seeing content with invalid times in the field, we
414  // are going to drop the cue instead of failing to package.
415  //
416  // For more context see:
417  // - https://www.w3.org/TR/webvtt1/#webvtt-cue-timings
418  // - https://github.com/shaka-project/shaka-packager/issues/335
419  // - https://github.com/shaka-project/shaka-packager/issues/425
420  //
421  // Print a warning so that those packaging content can know that their
422  // content is not spec compliant.
423  if (end_time <= start_time) {
424  LOG(WARNING) << "WebVTT input is not spec compliant. Start time ("
425  << start_time << ") should be less than end time (" << end_time
426  << "). Skipping webvtt cue:"
427  << BlockToString(block, block_size);
428  return true;
429  }
430 
431  TextSettings settings;
432  for (size_t i = 3; i < time_and_style.size(); i++) {
433  const auto pos = time_and_style[i].find(':');
434  if (pos == std::string::npos) {
435  continue;
436  }
437 
438  const std::string key = time_and_style[i].substr(0, pos);
439  const std::string value = time_and_style[i].substr(pos + 1);
440  ParseSettings(key, value, &settings);
441  }
442 
443  // The rest of the block is the payload.
444  // TODO: Parse tags to support <b>, <i>, etc.
445  TextFragment body;
446  TextFragmentStyle no_styles;
447  for (size_t i = 1; i < block_size; i++) {
448  if (i > 1 && i != block_size) {
449  body.sub_fragments.emplace_back(no_styles, /* newline= */ true);
450  }
451  body.sub_fragments.emplace_back(no_styles, block[i]);
452  }
453 
454  const auto sample =
455  std::make_shared<TextSample>(id, start_time, end_time, settings, body);
456  return new_text_sample_cb_(kStreamIndex, sample);
457 }
458 
459 void WebVttParser::DispatchTextStreamInfo() {
460  stream_info_dispatched_ = true;
461 
462  const int kTrackId = 0;
463  // The resolution of timings are in milliseconds.
464  const int kTimescale = 1000;
465  // The duration passed here is not very important. Also the whole file
466  // must be read before determining the real duration which doesn't
467  // work nicely with the current demuxer.
468  const int kDuration = 0;
469  const char kWebVttCodecString[] = "wvtt";
470  const int64_t kNoWidth = 0;
471  const int64_t kNoHeight = 0;
472  // The language of the stream will be overwritten by the Demuxer later.
473  const char kNoLanguage[] = "";
474 
475  const auto stream = std::make_shared<TextStreamInfo>(
476  kTrackId, kTimescale, kDuration, kCodecWebVtt, kWebVttCodecString, "",
477  kNoWidth, kNoHeight, kNoLanguage);
478  stream->set_css_styles(css_styles_);
479  for (const auto& pair : regions_)
480  stream->AddRegion(pair.first, pair.second);
481 
482  std::vector<std::shared_ptr<StreamInfo>> streams{stream};
483  init_cb_(streams);
484 }
485 
486 } // namespace media
487 } // namespace shaka
void PushData(const uint8_t *data, size_t data_size)
Pushes data onto the end of the buffer.
Definition: text_readers.cc:74
bool Next(std::vector< std::string > *out)
Definition: text_readers.cc:79
KeySource is responsible for encryption key acquisition.
Definition: key_source.h:52
std::function< bool(uint32_t track_id, std::shared_ptr< MediaSample > media_sample)> NewMediaSampleCB
Definition: media_parser.h:45
std::function< bool(uint32_t track_id, std::shared_ptr< TextSample > text_sample)> NewTextSampleCB
Definition: media_parser.h:54
std::function< void(const std::vector< std::shared_ptr< StreamInfo > > &stream_info)> InitCB
Definition: media_parser.h:36
void Init(const InitCB &init_cb, const NewMediaSampleCB &new_media_sample_cb, const NewTextSampleCB &new_text_sample_cb, KeySource *decryption_key_source) override
bool Parse(const uint8_t *buf, int size) override
All the methods that are virtual are virtual for mocking.
Definition: crypto_flags.cc:66