liblcf
Loading...
Searching...
No Matches
reader_util.cpp
Go to the documentation of this file.
1/*
2 * This file is part of liblcf. Copyright (c) liblcf authors.
3 * https://github.com/EasyRPG/liblcf - https://easyrpg.org
4 *
5 * liblcf is Free/Libre Open Source Software, released under the MIT License.
6 * For the full copyright and license information, please view the COPYING
7 * file that was distributed with this source code.
8 */
9
10#include "lcf/config.h"
11#include "lcf/scope_guard.h"
12
13#if LCF_SUPPORT_ICU
14# include <unicode/ucsdet.h>
15# include <unicode/ucnv.h>
16# include <unicode/normalizer2.h>
17# include <unicode/unistr.h>
18# include <unicode/locid.h>
19#else
20# ifdef _MSC_VER
21# error MSVC builds require ICU
22# endif
23#endif
24
25#ifdef _WIN32
26# include <windows.h>
27#else
28# if !LCF_SUPPORT_ICU
29# include <iconv.h>
30# endif
31# include <locale>
32#endif
33
#include <algorithm>
#include <cstdio>
#include <cstdlib>
#include <sstream>
#include <stdexcept>
#include <string>
#include <vector>
39
40#include "lcf/inireader.h"
41#include "lcf/ldb/reader.h"
42#include "lcf/reader_util.h"
43
44namespace lcf {
45
// No definitions live inside this namespace block; the ReaderUtil functions
// in this file are defined further down using qualified names.
namespace ReaderUtil {
} // namespace ReaderUtil
48
49std::string ReaderUtil::CodepageToEncoding(int codepage) {
50 if (codepage == 0)
51 return std::string();
52
53 if (codepage == 932) {
54#if LCF_SUPPORT_ICU
55 return "ibm-943_P15A-2003";
56#else
57 return "SHIFT_JIS";
58#endif
59 }
60 if (codepage == 949) {
61#if LCF_SUPPORT_ICU
62 return "windows-949-2000";
63#else
64 return "cp949";
65#endif
66 }
67 std::ostringstream out;
68#if LCF_SUPPORT_ICU
69 out << "windows-" << codepage;
70#else
71 out << "CP" << codepage;
72#endif
73
74 // Looks like a valid codepage
75 std::string outs = out.str();
76 return outs;
77}
78
79std::string ReaderUtil::DetectEncoding(lcf::rpg::Database& db) {
80 std::vector<std::string> encodings = DetectEncodings(db);
81
82 if (encodings.empty()) {
83 return "";
84 }
85
86 return encodings.front();
87}
88
89std::vector<std::string> ReaderUtil::DetectEncodings(lcf::rpg::Database& db) {
90#if LCF_SUPPORT_ICU
91 std::ostringstream text;
92
93 auto append = [](const auto& s) {
94 return ToString(s) + " ";
95 };
96
97 lcf::rpg::ForEachString(db.system, [&](const auto& val, const auto& ctx) {
98 text << append(val);
99 });
100
101 // Cannot use ForEachString here for Terms:
102 // Too much untranslated garbage data in there, even in default database
103 for (const auto& s: {
104 db.terms.menu_save,
105 db.terms.menu_quit,
106 db.terms.new_game,
107 db.terms.load_game,
108 db.terms.exit_game,
109 db.terms.status,
110 db.terms.row,
111 db.terms.order,
112 db.terms.wait_on,
113 db.terms.wait_off,
114 db.terms.level,
115 db.terms.health_points,
116 db.terms.spirit_points,
117 db.terms.normal_status,
118 db.terms.sp_cost,
119 db.terms.attack,
120 db.terms.defense,
121 db.terms.spirit,
122 db.terms.agility,
123 db.terms.weapon,
124 db.terms.shield,
125 db.terms.armor,
126 db.terms.helmet,
127 db.terms.accessory,
128 db.terms.save_game_message,
129 db.terms.load_game_message,
130 db.terms.exit_game_message,
131 db.terms.file,
132 db.terms.yes,
133 db.terms.no
134 }) {
135 text << append(s);
136 }
137
138 return ReaderUtil::DetectEncodings(text.str());
139#else
140 return std::vector<std::string>();
141#endif
142}
143
144std::string ReaderUtil::DetectEncoding(StringView string) {
145 std::vector<std::string> encodings = DetectEncodings(string);
146
147 if (encodings.empty()) {
148 return "";
149 }
150
151 return encodings.front();
152}
153
154std::vector<std::string> ReaderUtil::DetectEncodings(StringView string) {
155std::vector<std::string> encodings;
156#if LCF_SUPPORT_ICU
157 if (!string.empty()) {
158 UErrorCode status = U_ZERO_ERROR;
159 UCharsetDetector* detector = ucsdet_open(&status);
160
161 auto s = std::string(string);
162 ucsdet_setText(detector, s.c_str(), s.length(), &status);
163
164 int32_t matches_count;
165 const UCharsetMatch** matches = ucsdet_detectAll(detector, &matches_count, &status);
166
167 if (matches != nullptr) {
168 // Collect all candidates, most confident comes first
169 for (int i = 0; i < matches_count; ++i) {
170 std::string encoding = ucsdet_getName(matches[i], &status);
171
172 // Fixes to ensure proper Windows encodings
173 if (encoding == "Shift_JIS") {
174 encodings.emplace_back("ibm-943_P15A-2003"); // Japanese with \ as backslash
175 } else if (encoding == "EUC-KR") {
176 encodings.emplace_back("windows-949-2000"); // Korean with \ as backlash
177 } else if (encoding == "GB18030") {
178 encodings.emplace_back("windows-936-2000"); // Simplified Chinese
179 } else if (encoding == "ISO-8859-1" || encoding == "windows-1252") {
180 encodings.emplace_back("ibm-5348_P100-1997"); // Occidental with Euro
181 } else if (encoding == "ISO-8859-2" || encoding == "windows-1250") {
182 encodings.emplace_back("ibm-5346_P100-1998"); // Central Europe with Euro
183 } else if (encoding == "ISO-8859-5" || encoding == "windows-1251") {
184 encodings.emplace_back("ibm-5347_P100-1998"); // Cyrillic with Euro
185 } else if (encoding == "ISO-8859-6" || encoding == "windows-1256") {
186 encodings.emplace_back("ibm-9448_X100-2005"); // Arabic with Euro + 8 chars
187 } else if (encoding == "ISO-8859-7" || encoding == "windows-1253") {
188 encodings.emplace_back("ibm-5349_P100-1998"); // Greek with Euro
189 } else if (encoding == "ISO-8859-8" || encoding == "windows-1255") {
190 encodings.emplace_back("ibm-9447_P100-2002"); // Hebrew with Euro
191 } else {
192 encodings.push_back(encoding);
193 }
194 }
195 }
196 ucsdet_close(detector);
197 }
198#endif
199
200 return encodings;
201}
202
203std::string ReaderUtil::GetEncoding(StringView ini_file) {
204 INIReader ini(ToString(ini_file));
205 if (ini.ParseError() != -1) {
206 std::string encoding = ini.Get("EasyRPG", "Encoding", std::string());
207 if (!encoding.empty()) {
208 return ReaderUtil::CodepageToEncoding(atoi(encoding.c_str()));
209 }
210 }
211 return std::string();
212}
213
214std::string ReaderUtil::GetEncoding(std::istream& filestream) {
215 INIReader ini(filestream);
216 if (ini.ParseError() != -1) {
217 std::string encoding = ini.Get("EasyRPG", "Encoding", std::string());
218 if (!encoding.empty()) {
219 return ReaderUtil::CodepageToEncoding(atoi(encoding.c_str()));
220 }
221 }
222 return std::string();
223}
224
225std::string ReaderUtil::GetLocaleEncoding() {
226#ifdef _WIN32
227 int codepage = GetACP();
228#elif __ANDROID__
229 // No std::locale support in NDK
230 // Doesn't really matter because the Android version auto-detects via ICU
231 int codepage = 1252;
232#else
233 int codepage = 1252;
234
235 std::locale loc = std::locale("");
236 // Gets the language and culture part only
237 std::string loc_full = loc.name().substr(0, loc.name().find_first_of("@."));
238 // Gets the language part only
239 std::string loc_lang = loc.name().substr(0, loc.name().find_first_of("_"));
240
241 if (loc_lang == "th") codepage = 874;
242 else if (loc_lang == "ja") codepage = 932;
243 else if (loc_full == "zh_CN" ||
244 loc_full == "zh_SG") codepage = 936;
245 else if (loc_lang == "ko") codepage = 949;
246 else if (loc_full == "zh_TW" ||
247 loc_full == "zh_HK") codepage = 950;
248 else if (loc_lang == "cs" ||
249 loc_lang == "hu" ||
250 loc_lang == "pl" ||
251 loc_lang == "ro" ||
252 loc_lang == "hr" ||
253 loc_lang == "sk" ||
254 loc_lang == "sl") codepage = 1250;
255 else if (loc_lang == "ru") codepage = 1251;
256 else if (loc_lang == "ca" ||
257 loc_lang == "da" ||
258 loc_lang == "de" ||
259 loc_lang == "en" ||
260 loc_lang == "es" ||
261 loc_lang == "fi" ||
262 loc_lang == "fr" ||
263 loc_lang == "it" ||
264 loc_lang == "nl" ||
265 loc_lang == "nb" ||
266 loc_lang == "pt" ||
267 loc_lang == "sv" ||
268 loc_lang == "eu") codepage = 1252;
269 else if (loc_lang == "el") codepage = 1253;
270 else if (loc_lang == "tr") codepage = 1254;
271 else if (loc_lang == "he") codepage = 1255;
272 else if (loc_lang == "ar") codepage = 1256;
273 else if (loc_lang == "et" ||
274 loc_lang == "lt" ||
275 loc_lang == "lv") codepage = 1257;
276 else if (loc_lang == "vi") codepage = 1258;
277#endif
278
279 return CodepageToEncoding(codepage);
280}
281
282std::string ReaderUtil::Recode(StringView str_to_encode, StringView source_encoding) {
283 return ReaderUtil::Recode(str_to_encode, source_encoding, "UTF-8");
284}
285
286std::string ReaderUtil::Recode(StringView str_to_encode,
287 StringView src_enc,
288 StringView dst_enc) {
289
290 if (src_enc.empty() || dst_enc.empty() || str_to_encode.empty()) {
291 return ToString(str_to_encode);
292 }
293
294 auto src_cp = SvAtoi(src_enc);
295 const auto& src_enc_str = src_cp > 0
296 ? ReaderUtil::CodepageToEncoding(src_cp)
297 : ToString(src_enc);
298
299 auto dst_cp = SvAtoi(dst_enc);
300 const auto& dst_enc_str = dst_cp > 0
301 ? ReaderUtil::CodepageToEncoding(dst_cp)
302 : ToString(dst_enc);
303
304#if LCF_SUPPORT_ICU
305 auto status = U_ZERO_ERROR;
306 auto conv_from = ucnv_open(src_enc_str.c_str(), &status);
307
308 if (status != U_ZERO_ERROR && status != U_AMBIGUOUS_ALIAS_WARNING) {
309 fprintf(stderr, "liblcf: ucnv_open() error for source encoding \"%s\": %s\n", src_enc_str.c_str(), u_errorName(status));
310 return std::string();
311 }
312 status = U_ZERO_ERROR;
313 auto conv_from_sg = makeScopeGuard([&]() { ucnv_close(conv_from); });
314
315 auto conv_to = ucnv_open(dst_enc_str.c_str(), &status);
316
317 if (status != U_ZERO_ERROR && status != U_AMBIGUOUS_ALIAS_WARNING) {
318 fprintf(stderr, "liblcf: ucnv_open() error for dest encoding \"%s\": %s\n", dst_enc_str.c_str(), u_errorName(status));
319 return std::string();
320 }
321 auto conv_to_sg = makeScopeGuard([&]() { ucnv_close(conv_to); });
322 status = U_ZERO_ERROR;
323
324 std::string result(str_to_encode.size() * 4, '\0');
325 auto* src = str_to_encode.data();
326 auto* dst = &result.front();
327
328 ucnv_convertEx(conv_to, conv_from,
329 &dst, dst + result.size(),
330 &src, src + str_to_encode.size(),
331 nullptr, nullptr, nullptr, nullptr,
332 true, true,
333 &status);
334
335 if (U_FAILURE(status)) {
336 fprintf(stderr, "liblcf: ucnv_convertEx() error when encoding \"%.*s\": %s\n", (int)str_to_encode.length(), str_to_encode.data(), u_errorName(status));
337 return std::string();
338 }
339
340 result.resize(dst - result.c_str());
341 result.shrink_to_fit();
342
343 return result;
344#else
345 iconv_t cd = iconv_open(dst_enc_str.c_str(), src_enc_str.c_str());
346 if (cd == (iconv_t)-1)
347 return ToString(str_to_encode);
348 char *src = const_cast<char *>(str_to_encode.data());
349 size_t src_left = str_to_encode.size();
350 size_t dst_size = str_to_encode.size() * 5 + 10;
351 char *dst = new char[dst_size];
352 size_t dst_left = dst_size;
353# ifdef ICONV_CONST
354 char ICONV_CONST *p = src;
355# else
356 char *p = src;
357# endif
358 char *q = dst;
359 size_t status = iconv(cd, &p, &src_left, &q, &dst_left);
360 iconv_close(cd);
361 if (status == (size_t) -1 || src_left > 0) {
362 delete[] dst;
363 return std::string();
364 }
365 *q++ = '\0';
366 std::string result(dst);
367 delete[] dst;
368 return result;
369#endif
370}
371
372std::string ReaderUtil::Normalize(StringView str) {
373#if LCF_SUPPORT_ICU
374 icu::UnicodeString uni = icu::UnicodeString(str.data(), str.length(), "utf-8").toLower(icu::Locale::getRoot());
375 UErrorCode err = U_ZERO_ERROR;
376 std::string res;
377 const icu::Normalizer2* norm = icu::Normalizer2::getNFKCInstance(err);
378 if (U_FAILURE(err)) {
379 static bool err_reported = false;
380 if (!err_reported) {
381 fprintf(stderr, "Normalizer2::getNFKCInstance failed (%s). \"nrm\" is probably missing in the ICU data file. Unicode normalization will not work!\n", u_errorName(err));
382 err_reported = true;
383 }
384 uni.toUTF8String(res);
385 return res;
386 }
387 icu::UnicodeString f = norm->normalize(uni, err);
388 if (U_FAILURE(err)) {
389 uni.toUTF8String(res);
390 } else {
391 f.toUTF8String(res);
392 }
393 return res;
394#else
395 auto result = std::string(str);
396 std::transform(result.begin(), result.end(), result.begin(), tolower);
397 return result;
398#endif
399}
400
401} //namespace lcf
Definition: dbarray.cpp:13