Language.java |
1 /** 2 * Distribution License: 3 * JSword is free software; you can redistribute it and/or modify it under 4 * the terms of the GNU Lesser General Public License, version 2.1 or later 5 * as published by the Free Software Foundation. This program is distributed 6 * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even 7 * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. 8 * See the GNU Lesser General Public License for more details. 9 * 10 * The License is available on the internet at: 11 * http://www.gnu.org/copyleft/lgpl.html 12 * or by writing to: 13 * Free Software Foundation, Inc. 14 * 59 Temple Place - Suite 330 15 * Boston, MA 02111-1307, USA 16 * 17 * © CrossWire Bible Society, 2007 - 2016 18 * 19 */ 20 package org.crosswire.common.util; 21 22 import java.util.Locale; 23 24 /** 25 * An immutable Language by specification. The specifier consists of up to three parts: 26 * <ul> 27 * <li>LL - An iso639-2 or iso639-3 language code</li> 28 * <li>SSSS - A 4-letter iso15924 script code</li> 29 * <li>CC - A 2-letter iso3166 country code</li> 30 * </ul> 31 * Note: This is a subset of the BCP-47 standard. 32 * 33 * @see gnu.lgpl.License The GNU Lesser General Public License for details. 34 * @author DM Smith 35 */ 36 public class Language implements Comparable<Language> { 37 /** 38 * The default language code is en for English. 39 */ 40 public static final String DEFAULT_LANG_CODE = "en"; 41 42 /** 43 * The language code for invalid language specifications is und for Undetermined. 44 */ 45 public static final String UNKNOWN_LANG_CODE = "und"; 46 47 /** 48 * The default language is English. 49 */ 50 public static final Language DEFAULT_LANG = new Language(DEFAULT_LANG_CODE); 51 52 53 /** 54 * A single language defined by an ISO-639 code. If the code is null or 55 * empty then it is considered to be DEFAULT_LANG (that is, English). 56 * 57 * @param specification 58 * the specifier for the particular language 59 */ 60 public Language(String specification) { 61 given = specification; 62 parse(given); 63 } 64 65 /** 66 * The specification that was given might not be be the one that 67 * ultimately gets the name. 68 * 69 * @return the specification that was originally given. 70 */ 71 public String getGivenSpecification() { 72 return given; 73 } 74 75 /** 76 * The specification that was given might not be be the one that 77 * ultimately gets the name. 78 * 79 * @return the specification that was used to find the name. 80 */ 81 public String getFoundSpecification() { 82 getName(); 83 return found; 84 } 85 86 /** 87 * Determine whether this language is valid. 88 * <ul> 89 * <li>LL - An iso639-2 or iso639-3 language code</li> 90 * <li>SSSS - A 4-letter iso15924 script code</li> 91 * <li>CC - A 2-letter iso3166 country code</li> 92 * </ul> 93 * 94 * @return true if the language is valid. 95 */ 96 public boolean isValidLanguage() { 97 getName(); 98 return valid; 99 } 100 101 /** 102 * Get the iso639 language code. 103 * 104 * @return the code for the language in lower case. 105 */ 106 public String getCode() { 107 return code; 108 } 109 110 /** 111 * Get the iso15924 script for the language. May be null. 112 * 113 * @return the code for the script in Title case. 114 */ 115 public String getScript() { 116 return script; 117 } 118 119 /** 120 * Get the iso3166 script for the language. May be null. 121 * 122 * @return the code for the country in UPPER case. 123 */ 124 public String getCountry() { 125 return country; 126 } 127 128 /** 129 * Get the localized language name. 130 * 131 * @return the name of the language 132 */ 133 public String getName() { 134 // Note: This is not quite thread safe. Unless name is volatile. 135 // But it will just do the work multiple times. 136 if (name == null) { 137 boolean more = true; 138 // Code is the ultimate fallback 139 String result = code; 140 String lookup = code; 141 142 StringBuilder sb = new StringBuilder(); 143 // The lookup is as follows. 144 // There is always a code 145 // If all parts are specified then use that 146 if (script != null && country != null) { 147 sb.append(code); 148 sb.append('-'); 149 sb.append(script); 150 sb.append('-'); 151 sb.append(country); 152 lookup = sb.toString(); 153 result = Languages.getName(lookup); 154 more = lookup.equals(result); 155 } 156 157 // If script is specified it has precedence over country 158 if (more && script != null) { 159 sb.setLength(0); 160 sb.append(code); 161 sb.append('-'); 162 sb.append(script); 163 lookup = sb.toString(); 164 result = Languages.getName(lookup); 165 more = lookup.equals(result); 166 } 167 168 // If country was specified, check for that now. 169 if (more && country != null) { 170 sb.setLength(0); 171 sb.append(code); 172 sb.append('-'); 173 sb.append(country); 174 lookup = sb.toString(); 175 result = Languages.getName(lookup); 176 more = lookup.equals(result); 177 } 178 179 // Now check just the code. 180 if (more) { 181 lookup = code; 182 result = Languages.getName(lookup); 183 more = lookup.equals(result); 184 } 185 186 // Oops, the localized lookup failed. 187 // See if Java has one. 188 if (more) { 189 lookup = code; 190 result = new Locale(lookup).getDisplayLanguage(); 191 more = lookup.equals(result); 192 } 193 194 // Oops, Java doesn't have a clue 195 // Look into our heavy handed listing 196 if (more) { 197 lookup = code; 198 result = Languages.AllLanguages.getName(lookup); 199 more = lookup.equals(result); 200 } 201 202 // Oops, didn't find it anywhere. Mark it as invalid. 203 if (more) { 204 valid = false; 205 } 206 // now that we are here go with what we last used and got 207 found = lookup; 208 // Assign name last to help with synchronization issues 209 name = result; 210 } 211 return name; 212 } 213 214 /** 215 * Determine whether this language is a Left-to-Right or a Right-to-Left 216 * language. If the language has a script, it is used for the determination. 217 * Otherwise, check the language. 218 * <p> 219 * Note: This is problematic. Languages do not have direction. 220 * Scripts do. Further, there are over 7000 living languages, many of which 221 * are written in Right-to-Left scripts and are not listed here. 222 * </p> 223 * 224 * @return true if the language is Left-to-Right. 225 */ 226 public boolean isLeftToRight() { 227 if (!knowsDirection) { 228 ltor = !Languages.RtoL.isRtoL(script, code); 229 knowsDirection = true; 230 } 231 return ltor; 232 } 233 234 /* (non-Javadoc) 235 * @see java.lang.Object#hashCode() 236 */ 237 @Override 238 public int hashCode() { 239 if (found == null) { 240 getName(); 241 } 242 return found.hashCode(); 243 } 244 245 /* (non-Javadoc) 246 * @see java.lang.Object#equals(java.lang.Object) 247 */ 248 @Override 249 public boolean equals(Object obj) { 250 if (this == obj) { 251 return true; 252 } 253 254 if (obj == null || getClass() != obj.getClass()) { 255 return false; 256 } 257 258 final Language other = (Language) obj; 259 260 return code.equals(other.code) && compareStrings(script, other.script) && compareStrings(country, other.country); 261 } 262 263 /* (non-Javadoc) 264 * @see java.lang.Object#toString() 265 */ 266 @Override 267 public String toString() { 268 return getName(); 269 } 270 271 /* (non-Javadoc) 272 * @see java.lang.Comparable#compareTo(java.lang.Object) 273 */ 274 public int compareTo(Language o) { 275 return getName().compareTo(o.getName()); 276 } 277 278 /** 279 * Split the specification on '-' into 1 to 3 parts. 280 * 281 * @param spec the specification to parse 282 */ 283 private void parse(String spec) { 284 String specification = spec; 285 if (specification == null) { 286 specification = DEFAULT_LANG_CODE; 287 } 288 289 int len = specification.length(); 290 291 // It used to be that SWORD modules used x- and X- as a language prefix 292 // for minority languages. Now that we have a complete iso639 spec, 293 // SWORD does not use it. 294 if (len < 2 || specification.charAt(0) == '-' || specification.charAt(1) == '-') { 295 valid = false; 296 code = UNKNOWN_LANG_CODE; 297 return; 298 } 299 300 // Obvious optimization of the most common case: only the language code is given 301 if (len <= 3) { 302 code = CanonicalUtils.getLanguage(specification, 0, len); 303 } 304 305 int partLen = 0; 306 int start = 0; 307 int split; 308 for (split = 2; split < len; ++split) { 309 char c = specification.charAt(split); 310 if (c == '-') { 311 break; 312 } 313 } 314 code = CanonicalUtils.getLanguage(specification, start, split); 315 partLen = split - start; 316 valid = partLen == 2 || partLen == 3; 317 start = split + 1; 318 319 // Get the second part. It is either a script or a country code 320 if (split < len) { 321 for (split = start; split < len; ++split) { 322 char c = specification.charAt(split); 323 if (c == '-') { 324 break; 325 } 326 } 327 partLen = split - start; 328 if (partLen == 4) { 329 script = CanonicalUtils.getScript(specification, start, split); 330 } else if (partLen == 2) { 331 country = CanonicalUtils.getCountry(specification, start, split); 332 } else { 333 valid = false; 334 } 335 start = split + 1; 336 } 337 338 // Get the third part, if any. It can only be a country code. 339 if (country == null && split < len) { 340 for (split = start; split < len; ++split) { 341 char c = specification.charAt(split); 342 if (c == '-') { 343 break; 344 } 345 } 346 partLen = split - start; 347 if (partLen == 2) { 348 country = CanonicalUtils.getCountry(specification, start, split); 349 } else { 350 valid = false; 351 } 352 start = split + 1; 353 } 354 355 if (start <= len) { 356 valid = false; 357 } 358 } 359 360 /** 361 * Equal if both a and b are the same. 362 * 363 * @param a a string to compare 364 * @param b a string to compare 365 * @return true if both are the same. 366 */ 367 private boolean compareStrings(String a, String b) { 368 return (a == null && b == null) || (a != null && a.equals(b)); 369 } 370 371 /** 372 * Converts substrings to the canonical representation for language code, script and country. 373 */ 374 private static final class CanonicalUtils { 375 /** 376 * Utility class. Private constructor. 377 */ 378 private CanonicalUtils() { 379 } 380 381 /** 382 * The iso639 language code's canonical form is lower case. 383 * 384 * @param specification 385 * the bcp47 specification of the language 386 * @param start 387 * the start of the code 388 * @param end 389 * the position of the character following the code 390 * @return the canonical representation for the code 391 */ 392 public static String getLanguage(String specification, int start, int end) { 393 394 // An empty string means no work 395 if (start == end) { 396 return null; 397 } 398 399 // Avoid construction by analyzing the string 400 // to see if it is already LanguageCase. 401 // Find the first character that is not LanguageCase 402 int first; 403 for (first = start; first < end && isLowerASCII(specification.charAt(first)); ++first) { 404 continue; // keep going 405 } 406 407 // If we get to the end of the string then it is CountryCase 408 if (first == end) { 409 return specification.substring(start, end); 410 } 411 412 // Bummer, we need to do work 413 int len = end - start; 414 char[] buf = new char[len]; 415 int i = 0; 416 for (int j = start; j < end; ++j) { 417 buf[i++] = j < first ? specification.charAt(j) : toLowerASCII(specification.charAt(j)); 418 } 419 return new String(buf); 420 } 421 422 /** 423 * The iso3166 country code's canonical form is upper case. 424 * 425 * @param specification 426 * the bcp47 specification of the language 427 * @param start 428 * the start of the code 429 * @param end 430 * the position of the character following the code 431 * @return the canonical representation for the code 432 */ 433 public static String getCountry(String specification, int start, int end) { 434 435 // An empty string means no work 436 if (start == end) { 437 return null; 438 } 439 440 // Avoid construction by analyzing the string 441 // to see if it is already CountryCase. 442 // Find the first character that is not CountryCase 443 int first; 444 for (first = start; first < end && isUpperASCII(specification.charAt(first)); ++first) { 445 continue; // keep going 446 } 447 448 // If we get to the end of the string then it is CountryCase 449 if (first == end) { 450 return specification.substring(start, end); 451 } 452 453 // Bummer, we need to do work 454 int len = end - start; 455 char[] buf = new char[len]; 456 int i = 0; 457 for (int j = start; j < end; ++j) { 458 buf[i++] = j < first ? specification.charAt(j) : toUpperASCII(specification.charAt(j)); 459 } 460 return new String(buf); 461 } 462 463 /** 464 * The iso15924 script code's canonical form is title case. 465 * 466 * @param specification 467 * the bcp47 specification of the language 468 * @param start 469 * the start of the code 470 * @param end 471 * the position of the character following the code 472 * @return the canonical representation for the code 473 */ 474 public static String getScript(String specification, int start, int end) { 475 476 // An empty string means no work 477 if (start == end) { 478 return null; 479 } 480 481 // Avoid construction by analyzing the string 482 // to see if it is already ScriptCase. 483 // Find the first character that is not ScriptCase 484 int first = start; 485 if (isUpperASCII(specification.charAt(start))) { 486 for (first = start + 1; first < end && isLowerASCII(specification.charAt(first)); ++first) { 487 continue; // keep going 488 } 489 490 // If we get to the end of the string then it is ScriptCase 491 if (first == end) { 492 return specification.substring(start, end); 493 } 494 } 495 496 // Bummer, we need to do work. 497 int len = end - start; 498 char[] buf = new char[len]; 499 buf[0] = first == start ? toUpperASCII(specification.charAt(first)) : specification.charAt(first); 500 int i = 1; 501 for (int j = start + 1; j < end; ++j) { 502 buf[i++] = j < first ? specification.charAt(j) : toLowerASCII(specification.charAt(j)); 503 } 504 return new String(buf); 505 } 506 507 /** 508 * Determine whether the character is one of A-Z. 509 * 510 * @param c the character to examine 511 * @return true if it is in A-Z 512 */ 513 private static boolean isUpperASCII(char c) { 514 return c >= 'A' && c <= 'Z'; 515 } 516 517 /** 518 * Determine whether the character is one of a-z. 519 * 520 * @param c the character to examine 521 * @return true if it is in a-z 522 */ 523 private static boolean isLowerASCII(char c) { 524 return c >= 'a' && c <= 'z'; 525 } 526 527 /** 528 * Convert a character, in in a-z to its upper case value, otherwise leave it alone. 529 * 530 * @param c the character to convert, if in a-z 531 * @return the upper case ASCII representation of the character or the character itself. 532 */ 533 private static char toUpperASCII(char c) { 534 return isLowerASCII(c) ? (char) (c - 32) : c; 535 } 536 537 /** 538 * Convert a character, in in A-Z to its lower case value, otherwise leave it alone. 539 * 540 * @param c the character to convert, if in A-Z 541 * @return the lower case ASCII representation of the character or the character itself. 542 */ 543 private static char toLowerASCII(char c) { 544 return isUpperASCII(c) ? (char) (c + 32) : c; 545 } 546 } 547 548 /** 549 * The original specification provided by the user. 550 */ 551 private String given; 552 /** 553 * The effective specification. 554 */ 555 private String found; 556 /** 557 * The lower case iso639 language code. 558 */ 559 private String code; 560 /** 561 * The Title case iso15924 script code. 562 */ 563 private String script; 564 /** 565 * The UPPER case iso3166 country code. 566 */ 567 private String country; 568 /** 569 * The name as defined by Languages. 570 */ 571 private String name; 572 /** 573 * Flag to store whether the code is valid. 574 */ 575 private boolean valid; 576 private boolean knowsDirection; 577 private boolean ltor; 578 } 579