Shaka Packager SDK
Loading...
Searching...
No Matches
webvtt_parser.cc
1// Copyright 2017 Google LLC. All rights reserved.
2//
3// Use of this source code is governed by a BSD-style
4// license that can be found in the LICENSE file or at
5// https://developers.google.com/open-source/licenses/bsd
6
7#include <packager/media/formats/webvtt/webvtt_parser.h>
8
9#include <absl/log/check.h>
10#include <absl/log/log.h>
11#include <absl/strings/numbers.h>
12#include <absl/strings/str_format.h>
13#include <absl/strings/str_split.h>
14
15#include <packager/kv_pairs/kv_pairs.h>
16#include <packager/media/base/text_stream_info.h>
17#include <packager/media/formats/webvtt/webvtt_utils.h>
18#include <packager/utils/string_trim_split.h>
19
20namespace shaka {
21namespace media {
22namespace {
23
// Stream index handed to the text-sample callback for every parsed cue.
constexpr uint64_t kStreamIndex = 0;
25
// Renders the |size| lines starting at |block| as one delimited, multi-line
// string. Used when logging blocks that were skipped or could not be parsed.
std::string BlockToString(const std::string* block, size_t size) {
  std::string text(" --- BLOCK START ---\n");

  const std::string* const last = block + size;
  for (const std::string* line = block; line != last; ++line) {
    text += " ";
    text += *line;
    text += "\n";
  }

  text += " --- BLOCK END ---";
  return text;
}
39
// A comment block starts with the word "NOTE", which must either be the whole
// line or be followed by a space or a tab. Only that first line is inspected
// here; the caller discards the rest of the block.
// SOURCE: https://www.w3.org/TR/webvtt1
bool IsLikelyNote(const std::string& line) {
  const std::string::size_type kKeywordLength = 4;  // strlen("NOTE")

  // compare() clamps the range, so lines shorter than the keyword never match.
  if (line.compare(0, kKeywordLength, "NOTE") != 0)
    return false;
  if (line.size() == kKeywordLength)
    return true;

  const char separator = line[kKeywordLength];
  return separator == ' ' || separator == '\t';
}
48
// The cue timing line is the only place in a WebVTT file where "-->" may
// appear, so any line containing that arrow is treated as a cue timing line.
bool IsLikelyCueTiming(const std::string& line) {
  const std::string::size_type arrow_at = line.find("-->");
  const bool has_arrow = arrow_at != std::string::npos;
  return has_arrow;
}
55
// Per the spec, a cue identifier is a non-empty line that contains neither
// the substring "-->" nor any LF/CR characters. This check only rules out
// lines that contain "-->"; every other line is accepted as a possible id.
// SOURCE: https://www.w3.org/TR/webvtt1/#webvtt-cue-identifier
bool MaybeCueId(const std::string& line) {
  const bool has_arrow = line.find("-->") != std::string::npos;
  return !has_arrow;
}
64
// A style block is recognized by its first line: once trailing ASCII
// whitespace is ignored, the line must be exactly "STYLE".
// SOURCE: https://w3c.github.io/webvtt/#styling
bool IsLikelyStyle(const std::string& line) {
  // Index of the last character that is not ASCII whitespace
  // (space, \t, \n, \v, \f, \r).
  const std::string::size_type last = line.find_last_not_of(" \t\n\v\f\r");
  if (last == std::string::npos)
    return false;  // Empty or whitespace-only line.

  return line.compare(0, last + 1, "STYLE") == 0;
}
72
// A region block is recognized by its first line: once trailing ASCII
// whitespace is ignored, the line must be exactly "REGION".
// SOURCE: https://w3c.github.io/webvtt/#webvtt-region
bool IsLikelyRegion(const std::string& line) {
  // Index of the last character that is not ASCII whitespace
  // (space, \t, \n, \v, \f, \r).
  const std::string::size_type last = line.find_last_not_of(" \t\n\v\f\r");
  if (last == std::string::npos)
    return false;  // Empty or whitespace-only line.

  return line.compare(0, last + 1, "REGION") == 0;
}
80
81bool ParsePercent(const std::string& str, float* value) {
82 // https://www.w3.org/TR/webvtt1/#webvtt-percentage
83 // E.g. "4%" or "1.5%"
84 if (str[str.size() - 1] != '%') {
85 return false;
86 }
87
88 double temp;
89 if (!absl::SimpleAtod(str.substr(0, str.size() - 1), &temp) || temp > 100) {
90 return false;
91 }
92 *value = temp;
93 return true;
94}
95
96bool ParseDoublePercent(const std::string& str, float* a, float* b) {
97 std::vector<std::string> percents = SplitAndTrimSkipEmpty(str, ',');
98
99 if (percents.size() != 2) {
100 return false;
101 }
102 float temp_a, temp_b;
103 if (!ParsePercent(percents[0], &temp_a) ||
104 !ParsePercent(percents[1], &temp_b)) {
105 return false;
106 }
107 *a = temp_a;
108 *b = temp_b;
109 return true;
110}
111
112void ParseSettings(const std::string& id,
113 const std::string& value,
114 TextSettings* settings) {
115 // https://www.w3.org/TR/webvtt1/#ref-for-parse-the-webvtt-cue-settings-1
116 if (id == "region") {
117 settings->region = value;
118 } else if (id == "vertical") {
119 if (value == "rl") {
120 settings->writing_direction = WritingDirection::kVerticalGrowingLeft;
121 } else if (value == "lr") {
122 settings->writing_direction = WritingDirection::kVerticalGrowingRight;
123 } else {
124 LOG(WARNING) << "Invalid WebVTT vertical setting: " << value;
125 }
126 } else if (id == "line") {
127 const auto pos = value.find(',');
128 const std::string line = value.substr(0, pos);
129 const std::string align =
130 pos != std::string::npos ? value.substr(pos + 1) : "";
131 if (pos != std::string::npos) {
132 LOG(WARNING) << "WebVTT line alignment isn't supported";
133 }
134
135 if (!line.empty() && line[line.size() - 1] == '%') {
136 float temp;
137 if (!ParsePercent(line, &temp)) {
138 LOG(WARNING) << "Invalid WebVTT line: " << value;
139 return;
140 }
141 settings->line.emplace(temp, TextUnitType::kPercent);
142 } else {
143 double temp;
144 if (!absl::SimpleAtod(line, &temp)) {
145 LOG(WARNING) << "Invalid WebVTT line: " << value;
146 return;
147 }
148 settings->line.emplace(temp, TextUnitType::kLines);
149 }
150 } else if (id == "position") {
151 const auto pos = value.find(',');
152 const std::string position = value.substr(0, pos);
153 const std::string align =
154 pos != std::string::npos ? value.substr(pos + 1) : "";
155 if (pos != std::string::npos) {
156 LOG(WARNING) << "WebVTT position alignment isn't supported";
157 }
158
159 float temp;
160 if (ParsePercent(position, &temp)) {
161 settings->position.emplace(temp, TextUnitType::kPercent);
162 } else {
163 LOG(WARNING) << "Invalid WebVTT position: " << value;
164 }
165 } else if (id == "size") {
166 float temp;
167 if (ParsePercent(value, &temp)) {
168 settings->width.emplace(temp, TextUnitType::kPercent);
169 } else {
170 LOG(WARNING) << "Invalid WebVTT size: " << value;
171 }
172 } else if (id == "align") {
173 if (value == "start") {
174 settings->text_alignment = TextAlignment::kStart;
175 } else if (value == "center" || value == "middle") {
176 settings->text_alignment = TextAlignment::kCenter;
177 } else if (value == "end") {
178 settings->text_alignment = TextAlignment::kEnd;
179 } else if (value == "left") {
180 settings->text_alignment = TextAlignment::kLeft;
181 } else if (value == "right") {
182 settings->text_alignment = TextAlignment::kRight;
183 } else {
184 LOG(WARNING) << "Invalid WebVTT align: " << value;
185 }
186 } else {
187 LOG(WARNING) << "Unknown WebVTT setting: " << id;
188 }
189}
190
191} // namespace
192
193WebVttParser::WebVttParser() {}
194
195void WebVttParser::Init(const InitCB& init_cb,
196 const NewMediaSampleCB& new_media_sample_cb,
197 const NewTextSampleCB& new_text_sample_cb,
198 KeySource* decryption_key_source) {
199 DCHECK(init_cb_ == nullptr);
200 DCHECK(init_cb != nullptr);
201 DCHECK(new_text_sample_cb != nullptr);
202 DCHECK(!decryption_key_source) << "Encrypted WebVTT not supported";
203
204 init_cb_ = init_cb;
205 new_text_sample_cb_ = new_text_sample_cb;
206}
207
209 reader_.Flush();
210 return Parse();
211}
212
213bool WebVttParser::Parse(const uint8_t* buf, int size) {
214 reader_.PushData(buf, size);
215 return Parse();
216}
217
218bool WebVttParser::Parse() {
219 if (!initialized_) {
220 std::vector<std::string> block;
221 if (!reader_.Next(&block)) {
222 return true;
223 }
224
225 // Check the header. It is possible for a 0xFEFF BOM to come before the
226 // header text.
227 if (block.size() != 1) {
228 LOG(WARNING) << "Failed to read WEBVTT header - "
229 << "block size should be 1 but was " << block.size() << ".";
230 }
231 if (block[0] != "WEBVTT" && block[0] != "\xEF\xBB\xBFWEBVTT") {
232 LOG(WARNING) << "Failed to read WEBVTT header - should be WEBVTT but was "
233 << block[0];
234 }
235 initialized_ = true;
236 }
237
238 std::vector<std::string> block;
239 while (reader_.Next(&block)) {
240 if (!ParseBlock(block))
241 return false;
242 }
243 return true;
244}
245
246bool WebVttParser::ParseBlock(const std::vector<std::string>& block) {
247 // NOTE
248 if (IsLikelyNote(block[0])) {
249 // We can safely ignore the whole block.
250 return true;
251 }
252
253 // STYLE
254 if (IsLikelyStyle(block[0])) {
255 if (saw_cue_) {
256 LOG(WARNING)
257 << "Found style block after seeing cue. Ignoring style block";
258 } else {
259 for (size_t i = 1; i < block.size(); i++) {
260 if (!css_styles_.empty())
261 css_styles_ += "\n";
262 css_styles_ += block[i];
263 }
264 }
265 return true;
266 }
267
268 // REGION
269 if (IsLikelyRegion(block[0])) {
270 if (saw_cue_) {
271 LOG(WARNING)
272 << "Found region block after seeing cue. Ignoring region block";
273 return true;
274 } else {
275 return ParseRegion(block);
276 }
277 }
278
279 // CUE with ID
280 if (block.size() >= 2 && MaybeCueId(block[0]) &&
281 IsLikelyCueTiming(block[1]) && ParseCueWithId(block)) {
282 saw_cue_ = true;
283 return true;
284 }
285
286 // CUE with no ID
287 if (IsLikelyCueTiming(block[0]) && ParseCueWithNoId(block)) {
288 saw_cue_ = true;
289 return true;
290 }
291
292 LOG(ERROR) << "Failed to determine block classification:\n"
293 << BlockToString(block.data(), block.size());
294 return false;
295}
296
297bool WebVttParser::ParseRegion(const std::vector<std::string>& block) {
298 TextRegion region;
299 std::string region_id;
300 // Fill in defaults. Some may already be this, but set them anyway.
301 // See https://www.w3.org/TR/webvtt1/#regions
302 region.width.value = 100;
303 region.width.type = TextUnitType::kPercent;
304 region.height.value = 3;
305 region.height.type = TextUnitType::kLines;
306 region.window_anchor_x.value = 0;
307 region.window_anchor_x.type = TextUnitType::kPercent;
308 region.window_anchor_y.value = 100;
309 region.window_anchor_y.type = TextUnitType::kPercent;
310 region.region_anchor_x.value = 0;
311 region.region_anchor_x.type = TextUnitType::kPercent;
312 region.region_anchor_y.value = 100;
313 region.region_anchor_y.type = TextUnitType::kPercent;
314
315 bool first = true;
316 for (const auto& line : block) {
317 // First line is "REGION", skip.
318 if (first) {
319 first = false;
320 continue;
321 }
322
323 std::vector<KVPair> kv_pairs = SplitStringIntoKeyValuePairs(line, ':', ' ');
324
325 for (const auto& pair : kv_pairs) {
326 const std::string& value = pair.second;
327 if (pair.first == "id") {
328 if (value.find("-->") != std::string::npos) {
329 LOG(ERROR) << "Invalid WebVTT REGION ID: " << value;
330 return false;
331 }
332 if (regions_.find(value) != regions_.end()) {
333 LOG(ERROR) << "Duplicate WebVTT REGION: " << value;
334 return false;
335 }
336 region_id = value;
337 } else if (pair.first == "width") {
338 if (!ParsePercent(value, &region.width.value)) {
339 LOG(ERROR) << "Invalid WebVTT REGION width: " << value;
340 return false;
341 }
342 } else if (pair.first == "lines") {
343 unsigned int temp;
344 if (!absl::SimpleAtoi(value, &temp)) {
345 LOG(ERROR) << "Invalid WebVTT REGION lines: " << value;
346 return false;
347 }
348 region.height.value = temp;
349 } else if (pair.first == "regionanchor") {
350 if (!ParseDoublePercent(value, &region.region_anchor_x.value,
351 &region.region_anchor_y.value)) {
352 LOG(ERROR) << "Invalid WebVTT REGION regionanchor: " << value;
353 return false;
354 }
355 } else if (pair.first == "viewportanchor") {
356 if (!ParseDoublePercent(value, &region.window_anchor_x.value,
357 &region.window_anchor_y.value)) {
358 LOG(ERROR) << "Invalid WebVTT REGION windowanchor: " << value;
359 return false;
360 }
361 } else if (pair.first == "scroll") {
362 if (value != "up") {
363 LOG(ERROR) << "Invalid WebVTT REGION scroll: " << value;
364 return false;
365 }
366 region.scroll = true;
367 } else {
368 LOG(ERROR) << "Unknown WebVTT REGION setting: " << pair.first;
369 return false;
370 }
371 }
372 }
373 if (region_id.empty()) {
374 LOG(ERROR) << "WebVTT REGION id is required";
375 return false;
376 }
377 regions_.insert(std::make_pair(region_id, std::move(region)));
378 return true;
379}
380
381bool WebVttParser::ParseCueWithNoId(const std::vector<std::string>& block) {
382 return ParseCue("", block.data(), block.size());
383}
384
385bool WebVttParser::ParseCueWithId(const std::vector<std::string>& block) {
386 return ParseCue(block[0], block.data() + 1, block.size() - 1);
387}
388
389bool WebVttParser::ParseCue(const std::string& id,
390 const std::string* block,
391 size_t block_size) {
392 std::vector<std::string> time_and_style =
393 SplitAndTrimSkipEmpty(block[0], ' ');
394
395 int64_t start_time = 0;
396 int64_t end_time = 0;
397
398 const bool parsed_time =
399 time_and_style.size() >= 3 && time_and_style[1] == "-->" &&
400 WebVttTimestampToMs(time_and_style[0], &start_time) &&
401 WebVttTimestampToMs(time_and_style[2], &end_time);
402
403 if (!parsed_time) {
404 LOG(ERROR) << "Could not parse start time, -->, and end time from "
405 << block[0];
406 return false;
407 }
408
409 if (!stream_info_dispatched_)
410 DispatchTextStreamInfo();
411
412 // According to the WebVTT spec end time must be greater than the start time
413 // of the cue. Since we are seeing content with invalid times in the field, we
414 // are going to drop the cue instead of failing to package.
415 //
416 // For more context see:
417 // - https://www.w3.org/TR/webvtt1/#webvtt-cue-timings
418 // - https://github.com/shaka-project/shaka-packager/issues/335
419 // - https://github.com/shaka-project/shaka-packager/issues/425
420 //
421 // Print a warning so that those packaging content can know that their
422 // content is not spec compliant.
423 if (end_time <= start_time) {
424 LOG(WARNING) << "WebVTT input is not spec compliant. Start time ("
425 << start_time << ") should be less than end time (" << end_time
426 << "). Skipping webvtt cue:"
427 << BlockToString(block, block_size);
428 return true;
429 }
430
431 TextSettings settings;
432 for (size_t i = 3; i < time_and_style.size(); i++) {
433 const auto pos = time_and_style[i].find(':');
434 if (pos == std::string::npos) {
435 continue;
436 }
437
438 const std::string key = time_and_style[i].substr(0, pos);
439 const std::string value = time_and_style[i].substr(pos + 1);
440 ParseSettings(key, value, &settings);
441 }
442
443 // The rest of the block is the payload.
444 // TODO: Parse tags to support <b>, <i>, etc.
445 TextFragment body;
446 TextFragmentStyle no_styles;
447 for (size_t i = 1; i < block_size; i++) {
448 if (i > 1 && i != block_size) {
449 body.sub_fragments.emplace_back(no_styles, /* newline= */ true);
450 }
451 body.sub_fragments.emplace_back(no_styles, block[i]);
452 }
453
454 const auto sample =
455 std::make_shared<TextSample>(id, start_time, end_time, settings, body);
456 return new_text_sample_cb_(kStreamIndex, sample);
457}
458
459void WebVttParser::DispatchTextStreamInfo() {
460 stream_info_dispatched_ = true;
461
462 const int kTrackId = 0;
463 // The resolution of timings are in milliseconds.
464 const int kTimescale = 1000;
465 // The duration passed here is not very important. Also the whole file
466 // must be read before determining the real duration which doesn't
467 // work nicely with the current demuxer.
468 const int kDuration = 0;
469 const char kWebVttCodecString[] = "wvtt";
470 const int64_t kNoWidth = 0;
471 const int64_t kNoHeight = 0;
472 // The language of the stream will be overwritten by the Demuxer later.
473 const char kNoLanguage[] = "";
474
475 const auto stream = std::make_shared<TextStreamInfo>(
476 kTrackId, kTimescale, kDuration, kCodecWebVtt, kWebVttCodecString, "",
477 kNoWidth, kNoHeight, kNoLanguage);
478 stream->set_css_styles(css_styles_);
479 for (const auto& pair : regions_)
480 stream->AddRegion(pair.first, pair.second);
481
482 std::vector<std::shared_ptr<StreamInfo>> streams{stream};
483 init_cb_(streams);
484}
485
486} // namespace media
487} // namespace shaka
void PushData(const uint8_t *data, size_t data_size)
Pushes data onto the end of the buffer.
bool Next(std::vector< std::string > *out)
KeySource is responsible for encryption key acquisition.
Definition key_source.h:52
std::function< bool(uint32_t track_id, std::shared_ptr< MediaSample > media_sample)> NewMediaSampleCB
std::function< bool(uint32_t track_id, std::shared_ptr< TextSample > text_sample)> NewTextSampleCB
std::function< void(const std::vector< std::shared_ptr< StreamInfo > > &stream_info)> InitCB
void Init(const InitCB &init_cb, const NewMediaSampleCB &new_media_sample_cb, const NewTextSampleCB &new_text_sample_cb, KeySource *decryption_key_source) override
bool Parse(const uint8_t *buf, int size) override
All the methods that are virtual are virtual for mocking.