Shaka Packager SDK
Loading...
Searching...
No Matches
language_utils.cc
1// Copyright 2015 Google LLC. All rights reserved.
2//
3// Use of this source code is governed by a BSD-style
4// license that can be found in the LICENSE file or at
5// https://developers.google.com/open-source/licenses/bsd
6
7#include <packager/media/base/language_utils.h>
8
9#include <iterator>
10
11#include <absl/log/check.h>
12#include <absl/log/log.h>
13
14namespace {
15
// One row of the language table: a 3-letter code (ISO 639-2) paired with the
// 2-letter code (ISO 639-1) registered for the same language. Both fields are
// fixed-size, nul-terminated character arrays so that rows can be
// brace-initialized directly from string literals.
struct LanguageMapPairType {
  const char iso_639_2[4];  // 3 letters + terminating nul.
  const char iso_639_1[3];  // 2 letters + terminating nul.
};
22const LanguageMapPairType kLanguageMap[] = {
23 {"aar", "aa"}, {"abk", "ab"}, {"afr", "af"}, {"aka", "ak"}, {"alb", "sq"},
24 {"amh", "am"}, {"ara", "ar"}, {"arg", "an"}, {"arm", "hy"}, {"asm", "as"},
25 {"ava", "av"}, {"ave", "ae"}, {"aym", "ay"}, {"aze", "az"}, {"bak", "ba"},
26 {"bam", "bm"}, {"baq", "eu"}, {"bel", "be"}, {"ben", "bn"}, {"bih", "bh"},
27 {"bis", "bi"}, {"bod", "bo"}, {"bos", "bs"}, {"bre", "br"}, {"bul", "bg"},
28 {"bur", "my"}, {"cat", "ca"}, {"ces", "cs"}, {"cha", "ch"}, {"che", "ce"},
29 {"chi", "zh"}, {"chu", "cu"}, {"chv", "cv"}, {"cor", "kw"}, {"cos", "co"},
30 {"cre", "cr"}, {"cym", "cy"}, {"cze", "cs"}, {"dan", "da"}, {"deu", "de"},
31 {"div", "dv"}, {"dut", "nl"}, {"dzo", "dz"}, {"ell", "el"}, {"eng", "en"},
32 {"epo", "eo"}, {"est", "et"}, {"eus", "eu"}, {"ewe", "ee"}, {"fao", "fo"},
33 {"fas", "fa"}, {"fij", "fj"}, {"fin", "fi"}, {"fra", "fr"}, {"fre", "fr"},
34 {"fry", "fy"}, {"ful", "ff"}, {"geo", "ka"}, {"ger", "de"}, {"gla", "gd"},
35 {"gle", "ga"}, {"glg", "gl"}, {"glv", "gv"}, {"gre", "el"}, {"grn", "gn"},
36 {"guj", "gu"}, {"hat", "ht"}, {"hau", "ha"}, {"heb", "he"}, {"heb", "iw"},
37 {"her", "hz"}, {"hin", "hi"}, {"hmo", "ho"}, {"hrv", "hr"}, {"hun", "hu"},
38 {"hye", "hy"}, {"ibo", "ig"}, {"ice", "is"}, {"ido", "io"}, {"iii", "ii"},
39 {"iku", "iu"}, {"ile", "ie"}, {"ina", "ia"}, {"ind", "id"}, {"ipk", "ik"},
40 {"isl", "is"}, {"ita", "it"}, {"jav", "jv"}, {"jpn", "ja"}, {"kal", "kl"},
41 {"kan", "kn"}, {"kas", "ks"}, {"kat", "ka"}, {"kau", "kr"}, {"kaz", "kk"},
42 {"khm", "km"}, {"kik", "ki"}, {"kin", "rw"}, {"kir", "ky"}, {"kom", "kv"},
43 {"kon", "kg"}, {"kor", "ko"}, {"kua", "kj"}, {"kur", "ku"}, {"lao", "lo"},
44 {"lat", "la"}, {"lav", "lv"}, {"lim", "li"}, {"lin", "ln"}, {"lit", "lt"},
45 {"ltz", "lb"}, {"lub", "lu"}, {"lug", "lg"}, {"mac", "mk"}, {"mah", "mh"},
46 {"mal", "ml"}, {"mao", "mi"}, {"mar", "mr"}, {"may", "ms"}, {"mkd", "mk"},
47 {"mlg", "mg"}, {"mlt", "mt"}, {"mon", "mn"}, {"mri", "mi"}, {"msa", "ms"},
48 {"mya", "my"}, {"nau", "na"}, {"nav", "nv"}, {"nbl", "nr"}, {"nde", "nd"},
49 {"ndo", "ng"}, {"nep", "ne"}, {"nld", "nl"}, {"nno", "nn"}, {"nob", "nb"},
50 {"nor", "no"}, {"nya", "ny"}, {"oci", "oc"}, {"oji", "oj"}, {"ori", "or"},
51 {"orm", "om"}, {"oss", "os"}, {"pan", "pa"}, {"per", "fa"}, {"pli", "pi"},
52 {"pol", "pl"}, {"por", "pt"}, {"pus", "ps"}, {"que", "qu"}, {"roh", "rm"},
53 {"ron", "ro"}, {"rum", "ro"}, {"run", "rn"}, {"rus", "ru"}, {"sag", "sg"},
54 {"san", "sa"}, {"sin", "si"}, {"slk", "sk"}, {"slo", "sk"}, {"slv", "sl"},
55 {"sme", "se"}, {"smo", "sm"}, {"sna", "sn"}, {"snd", "sd"}, {"som", "so"},
56 {"sot", "st"}, {"spa", "es"}, {"sqi", "sq"}, {"srd", "sc"}, {"srp", "sr"},
57 {"ssw", "ss"}, {"sun", "su"}, {"swa", "sw"}, {"swe", "sv"}, {"tah", "ty"},
58 {"tam", "ta"}, {"tat", "tt"}, {"tel", "te"}, {"tgk", "tg"}, {"tgl", "tl"},
59 {"tha", "th"}, {"tib", "bo"}, {"tir", "ti"}, {"ton", "to"}, {"tsn", "tn"},
60 {"tso", "ts"}, {"tuk", "tk"}, {"tur", "tr"}, {"twi", "tw"}, {"uig", "ug"},
61 {"ukr", "uk"}, {"urd", "ur"}, {"uzb", "uz"}, {"ven", "ve"}, {"vie", "vi"},
62 {"vol", "vo"}, {"wel", "cy"}, {"wln", "wa"}, {"wol", "wo"}, {"xho", "xh"},
63 {"yid", "yi"}, {"yor", "yo"}, {"zha", "za"}, {"zho", "zh"}, {"zul", "zu"},
64};
65
// Splits |tag| at its first '-' character.
//
// On return, |*main_language| holds everything before the first dash and
// |*subtag| holds the remainder *including* the leading dash, so that
// concatenating the two outputs reproduces |tag|. When |tag| contains no
// dash, |*main_language| is a copy of |tag| and |*subtag| is empty.
void SplitLanguageTag(const std::string& tag,
                      std::string* main_language,
                      std::string* subtag) {
  const std::string::size_type dash_pos = tag.find('-');
  const bool has_subtag = dash_pos != std::string::npos;

  // substr(0, npos) yields the whole string, which covers the no-dash case.
  const std::string head = tag.substr(0, dash_pos);
  const std::string tail = has_subtag ? tag.substr(dash_pos) : std::string();

  *main_language = head;
  *subtag = tail;
}
77
78} // namespace
79
80namespace shaka {
81
82std::string LanguageToShortestForm(const std::string& language) {
83 // Do not try to mangle blank strings.
84 if (language.size() == 0) {
85 return language;
86 }
87
88 std::string main_language;
89 std::string subtag;
90 SplitLanguageTag(language, &main_language, &subtag);
91
92 if (main_language.size() == 2) {
93 // Presumably already a valid ISO-639-1 code, and therefore conforms to
94 // BCP-47's requirement to use the shortest possible code.
95 return main_language + subtag;
96 }
97
98 for (size_t i = 0; i < std::size(kLanguageMap); ++i) {
99 if (main_language == kLanguageMap[i].iso_639_2) {
100 return kLanguageMap[i].iso_639_1 + subtag;
101 }
102 }
103
104 // This could happen legitimately for languages which have no 2-letter code,
105 // but that would imply that the input language code is a 3-letter code.
106 DCHECK_EQ(3u, main_language.size()) << main_language;
107 return main_language + subtag;
108}
109
110std::string LanguageToISO_639_2(const std::string& language) {
111 std::string main_language;
112 std::string subtag;
113 SplitLanguageTag(language, &main_language, &subtag);
114
115 if (main_language.size() == 3) {
116 // Presumably already a valid ISO-639-2 code.
117 return main_language + subtag;
118 }
119
120 for (size_t i = 0; i < std::size(kLanguageMap); ++i) {
121 if (main_language == kLanguageMap[i].iso_639_1) {
122 return kLanguageMap[i].iso_639_2 + subtag;
123 }
124 }
125
126 LOG(WARNING) << "No equivalent 3-letter language code for " << main_language;
127 // This is probably a mistake on the part of the user and should be treated
128 // as invalid input.
129 return "und";
130}
131
132} // namespace shaka
All the methods that are virtual are virtual for mocking.
std::string LanguageToISO_639_2(const std::string &language)
std::string LanguageToShortestForm(const std::string &language)