| Language.java |
1 /**
2 * Distribution License:
3 * JSword is free software; you can redistribute it and/or modify it under
4 * the terms of the GNU Lesser General Public License, version 2.1 or later
5 * as published by the Free Software Foundation. This program is distributed
6 * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
7 * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8 * See the GNU Lesser General Public License for more details.
9 *
10 * The License is available on the internet at:
11 * http://www.gnu.org/copyleft/lgpl.html
12 * or by writing to:
13 * Free Software Foundation, Inc.
14 * 59 Temple Place - Suite 330
15 * Boston, MA 02111-1307, USA
16 *
17 * © CrossWire Bible Society, 2007 - 2016
18 *
19 */
20 package org.crosswire.common.util;
21
22 import java.util.Locale;
23
24 /**
25 * An immutable Language by specification. The specifier consists of up to three parts:
26 * <ul>
27 * <li>LL - An iso639-2 or iso639-3 language code</li>
28 * <li>SSSS - A 4-letter iso15924 script code</li>
29 * <li>CC - A 2-letter iso3166 country code</li>
30 * </ul>
31 * Note: This is a subset of the BCP-47 standard.
32 *
33 * @see gnu.lgpl.License The GNU Lesser General Public License for details.
34 * @author DM Smith
35 */
36 public class Language implements Comparable<Language> {
37 /**
38 * The default language code is en for English.
39 */
40 public static final String DEFAULT_LANG_CODE = "en";
41
42 /**
43 * The language code for invalid language specifications is und for Undetermined.
44 */
45 public static final String UNKNOWN_LANG_CODE = "und";
46
47 /**
48 * The default language is English.
49 */
50 public static final Language DEFAULT_LANG = new Language(DEFAULT_LANG_CODE);
51
52
53 /**
54 * A single language defined by an ISO-639 code. If the code is null or
55 * empty then it is considered to be DEFAULT_LANG (that is, English).
56 *
57 * @param specification
58 * the specifier for the particular language
59 */
60 public Language(String specification) {
61 given = specification;
62 parse(given);
63 }
64
65 /**
66 * The specification that was given might not be be the one that
67 * ultimately gets the name.
68 *
69 * @return the specification that was originally given.
70 */
71 public String getGivenSpecification() {
72 return given;
73 }
74
75 /**
76 * The specification that was given might not be be the one that
77 * ultimately gets the name.
78 *
79 * @return the specification that was used to find the name.
80 */
81 public String getFoundSpecification() {
82 getName();
83 return found;
84 }
85
86 /**
87 * Determine whether this language is valid.
88 * <ul>
89 * <li>LL - An iso639-2 or iso639-3 language code</li>
90 * <li>SSSS - A 4-letter iso15924 script code</li>
91 * <li>CC - A 2-letter iso3166 country code</li>
92 * </ul>
93 *
94 * @return true if the language is valid.
95 */
96 public boolean isValidLanguage() {
97 getName();
98 return valid;
99 }
100
101 /**
102 * Get the iso639 language code.
103 *
104 * @return the code for the language in lower case.
105 */
106 public String getCode() {
107 return code;
108 }
109
110 /**
111 * Get the iso15924 script for the language. May be null.
112 *
113 * @return the code for the script in Title case.
114 */
115 public String getScript() {
116 return script;
117 }
118
119 /**
120 * Get the iso3166 script for the language. May be null.
121 *
122 * @return the code for the country in UPPER case.
123 */
124 public String getCountry() {
125 return country;
126 }
127
128 /**
129 * Get the localized language name.
130 *
131 * @return the name of the language
132 */
133 public String getName() {
134 // Note: This is not quite thread safe. Unless name is volatile.
135 // But it will just do the work multiple times.
136 if (name == null) {
137 boolean more = true;
138 // Code is the ultimate fallback
139 String result = code;
140 String lookup = code;
141
142 StringBuilder sb = new StringBuilder();
143 // The lookup is as follows.
144 // There is always a code
145 // If all parts are specified then use that
146 if (script != null && country != null) {
147 sb.append(code);
148 sb.append('-');
149 sb.append(script);
150 sb.append('-');
151 sb.append(country);
152 lookup = sb.toString();
153 result = Languages.getName(lookup);
154 more = lookup.equals(result);
155 }
156
157 // If script is specified it has precedence over country
158 if (more && script != null) {
159 sb.setLength(0);
160 sb.append(code);
161 sb.append('-');
162 sb.append(script);
163 lookup = sb.toString();
164 result = Languages.getName(lookup);
165 more = lookup.equals(result);
166 }
167
168 // If country was specified, check for that now.
169 if (more && country != null) {
170 sb.setLength(0);
171 sb.append(code);
172 sb.append('-');
173 sb.append(country);
174 lookup = sb.toString();
175 result = Languages.getName(lookup);
176 more = lookup.equals(result);
177 }
178
179 // Now check just the code.
180 if (more) {
181 lookup = code;
182 result = Languages.getName(lookup);
183 more = lookup.equals(result);
184 }
185
186 // Oops, the localized lookup failed.
187 // See if Java has one.
188 if (more) {
189 lookup = code;
190 result = new Locale(lookup).getDisplayLanguage();
191 more = lookup.equals(result);
192 }
193
194 // Oops, Java doesn't have a clue
195 // Look into our heavy handed listing
196 if (more) {
197 lookup = code;
198 result = Languages.AllLanguages.getName(lookup);
199 more = lookup.equals(result);
200 }
201
202 // Oops, didn't find it anywhere. Mark it as invalid.
203 if (more) {
204 valid = false;
205 }
206 // now that we are here go with what we last used and got
207 found = lookup;
208 // Assign name last to help with synchronization issues
209 name = result;
210 }
211 return name;
212 }
213
214 /**
215 * Determine whether this language is a Left-to-Right or a Right-to-Left
216 * language. If the language has a script, it is used for the determination.
217 * Otherwise, check the language.
218 * <p>
219 * Note: This is problematic. Languages do not have direction.
220 * Scripts do. Further, there are over 7000 living languages, many of which
221 * are written in Right-to-Left scripts and are not listed here.
222 * </p>
223 *
224 * @return true if the language is Left-to-Right.
225 */
226 public boolean isLeftToRight() {
227 if (!knowsDirection) {
228 ltor = !Languages.RtoL.isRtoL(script, code);
229 knowsDirection = true;
230 }
231 return ltor;
232 }
233
234 /* (non-Javadoc)
235 * @see java.lang.Object#hashCode()
236 */
237 @Override
238 public int hashCode() {
239 if (found == null) {
240 getName();
241 }
242 return found.hashCode();
243 }
244
245 /* (non-Javadoc)
246 * @see java.lang.Object#equals(java.lang.Object)
247 */
248 @Override
249 public boolean equals(Object obj) {
250 if (this == obj) {
251 return true;
252 }
253
254 if (obj == null || getClass() != obj.getClass()) {
255 return false;
256 }
257
258 final Language other = (Language) obj;
259
260 return code.equals(other.code) && compareStrings(script, other.script) && compareStrings(country, other.country);
261 }
262
263 /* (non-Javadoc)
264 * @see java.lang.Object#toString()
265 */
266 @Override
267 public String toString() {
268 return getName();
269 }
270
271 /* (non-Javadoc)
272 * @see java.lang.Comparable#compareTo(java.lang.Object)
273 */
274 public int compareTo(Language o) {
275 return getName().compareTo(o.getName());
276 }
277
278 /**
279 * Split the specification on '-' into 1 to 3 parts.
280 *
281 * @param spec the specification to parse
282 */
283 private void parse(String spec) {
284 String specification = spec;
285 if (specification == null) {
286 specification = DEFAULT_LANG_CODE;
287 }
288
289 int len = specification.length();
290
291 // It used to be that SWORD modules used x- and X- as a language prefix
292 // for minority languages. Now that we have a complete iso639 spec,
293 // SWORD does not use it.
294 if (len < 2 || specification.charAt(0) == '-' || specification.charAt(1) == '-') {
295 valid = false;
296 code = UNKNOWN_LANG_CODE;
297 return;
298 }
299
300 // Obvious optimization of the most common case: only the language code is given
301 if (len <= 3) {
302 code = CanonicalUtils.getLanguage(specification, 0, len);
303 }
304
305 int partLen = 0;
306 int start = 0;
307 int split;
308 for (split = 2; split < len; ++split) {
309 char c = specification.charAt(split);
310 if (c == '-') {
311 break;
312 }
313 }
314 code = CanonicalUtils.getLanguage(specification, start, split);
315 partLen = split - start;
316 valid = partLen == 2 || partLen == 3;
317 start = split + 1;
318
319 // Get the second part. It is either a script or a country code
320 if (split < len) {
321 for (split = start; split < len; ++split) {
322 char c = specification.charAt(split);
323 if (c == '-') {
324 break;
325 }
326 }
327 partLen = split - start;
328 if (partLen == 4) {
329 script = CanonicalUtils.getScript(specification, start, split);
330 } else if (partLen == 2) {
331 country = CanonicalUtils.getCountry(specification, start, split);
332 } else {
333 valid = false;
334 }
335 start = split + 1;
336 }
337
338 // Get the third part, if any. It can only be a country code.
339 if (country == null && split < len) {
340 for (split = start; split < len; ++split) {
341 char c = specification.charAt(split);
342 if (c == '-') {
343 break;
344 }
345 }
346 partLen = split - start;
347 if (partLen == 2) {
348 country = CanonicalUtils.getCountry(specification, start, split);
349 } else {
350 valid = false;
351 }
352 start = split + 1;
353 }
354
355 if (start <= len) {
356 valid = false;
357 }
358 }
359
360 /**
361 * Equal if both a and b are the same.
362 *
363 * @param a a string to compare
364 * @param b a string to compare
365 * @return true if both are the same.
366 */
367 private boolean compareStrings(String a, String b) {
368 return (a == null && b == null) || (a != null && a.equals(b));
369 }
370
371 /**
372 * Converts substrings to the canonical representation for language code, script and country.
373 */
374 private static final class CanonicalUtils {
375 /**
376 * Utility class. Private constructor.
377 */
378 private CanonicalUtils() {
379 }
380
381 /**
382 * The iso639 language code's canonical form is lower case.
383 *
384 * @param specification
385 * the bcp47 specification of the language
386 * @param start
387 * the start of the code
388 * @param end
389 * the position of the character following the code
390 * @return the canonical representation for the code
391 */
392 public static String getLanguage(String specification, int start, int end) {
393
394 // An empty string means no work
395 if (start == end) {
396 return null;
397 }
398
399 // Avoid construction by analyzing the string
400 // to see if it is already LanguageCase.
401 // Find the first character that is not LanguageCase
402 int first;
403 for (first = start; first < end && isLowerASCII(specification.charAt(first)); ++first) {
404 continue; // keep going
405 }
406
407 // If we get to the end of the string then it is CountryCase
408 if (first == end) {
409 return specification.substring(start, end);
410 }
411
412 // Bummer, we need to do work
413 int len = end - start;
414 char[] buf = new char[len];
415 int i = 0;
416 for (int j = start; j < end; ++j) {
417 buf[i++] = j < first ? specification.charAt(j) : toLowerASCII(specification.charAt(j));
418 }
419 return new String(buf);
420 }
421
422 /**
423 * The iso3166 country code's canonical form is upper case.
424 *
425 * @param specification
426 * the bcp47 specification of the language
427 * @param start
428 * the start of the code
429 * @param end
430 * the position of the character following the code
431 * @return the canonical representation for the code
432 */
433 public static String getCountry(String specification, int start, int end) {
434
435 // An empty string means no work
436 if (start == end) {
437 return null;
438 }
439
440 // Avoid construction by analyzing the string
441 // to see if it is already CountryCase.
442 // Find the first character that is not CountryCase
443 int first;
444 for (first = start; first < end && isUpperASCII(specification.charAt(first)); ++first) {
445 continue; // keep going
446 }
447
448 // If we get to the end of the string then it is CountryCase
449 if (first == end) {
450 return specification.substring(start, end);
451 }
452
453 // Bummer, we need to do work
454 int len = end - start;
455 char[] buf = new char[len];
456 int i = 0;
457 for (int j = start; j < end; ++j) {
458 buf[i++] = j < first ? specification.charAt(j) : toUpperASCII(specification.charAt(j));
459 }
460 return new String(buf);
461 }
462
463 /**
464 * The iso15924 script code's canonical form is title case.
465 *
466 * @param specification
467 * the bcp47 specification of the language
468 * @param start
469 * the start of the code
470 * @param end
471 * the position of the character following the code
472 * @return the canonical representation for the code
473 */
474 public static String getScript(String specification, int start, int end) {
475
476 // An empty string means no work
477 if (start == end) {
478 return null;
479 }
480
481 // Avoid construction by analyzing the string
482 // to see if it is already ScriptCase.
483 // Find the first character that is not ScriptCase
484 int first = start;
485 if (isUpperASCII(specification.charAt(start))) {
486 for (first = start + 1; first < end && isLowerASCII(specification.charAt(first)); ++first) {
487 continue; // keep going
488 }
489
490 // If we get to the end of the string then it is ScriptCase
491 if (first == end) {
492 return specification.substring(start, end);
493 }
494 }
495
496 // Bummer, we need to do work.
497 int len = end - start;
498 char[] buf = new char[len];
499 buf[0] = first == start ? toUpperASCII(specification.charAt(first)) : specification.charAt(first);
500 int i = 1;
501 for (int j = start + 1; j < end; ++j) {
502 buf[i++] = j < first ? specification.charAt(j) : toLowerASCII(specification.charAt(j));
503 }
504 return new String(buf);
505 }
506
507 /**
508 * Determine whether the character is one of A-Z.
509 *
510 * @param c the character to examine
511 * @return true if it is in A-Z
512 */
513 private static boolean isUpperASCII(char c) {
514 return c >= 'A' && c <= 'Z';
515 }
516
517 /**
518 * Determine whether the character is one of a-z.
519 *
520 * @param c the character to examine
521 * @return true if it is in a-z
522 */
523 private static boolean isLowerASCII(char c) {
524 return c >= 'a' && c <= 'z';
525 }
526
527 /**
528 * Convert a character, in in a-z to its upper case value, otherwise leave it alone.
529 *
530 * @param c the character to convert, if in a-z
531 * @return the upper case ASCII representation of the character or the character itself.
532 */
533 private static char toUpperASCII(char c) {
534 return isLowerASCII(c) ? (char) (c - 32) : c;
535 }
536
537 /**
538 * Convert a character, in in A-Z to its lower case value, otherwise leave it alone.
539 *
540 * @param c the character to convert, if in A-Z
541 * @return the lower case ASCII representation of the character or the character itself.
542 */
543 private static char toLowerASCII(char c) {
544 return isUpperASCII(c) ? (char) (c + 32) : c;
545 }
546 }
547
548 /**
549 * The original specification provided by the user.
550 */
551 private String given;
552 /**
553 * The effective specification.
554 */
555 private String found;
556 /**
557 * The lower case iso639 language code.
558 */
559 private String code;
560 /**
561 * The Title case iso15924 script code.
562 */
563 private String script;
564 /**
565 * The UPPER case iso3166 country code.
566 */
567 private String country;
568 /**
569 * The name as defined by Languages.
570 */
571 private String name;
572 /**
573 * Flag to store whether the code is valid.
574 */
575 private boolean valid;
576 private boolean knowsDirection;
577 private boolean ltor;
578 }
579