1   /**
2    * Distribution License:
3    * JSword is free software; you can redistribute it and/or modify it under
4    * the terms of the GNU Lesser General Public License, version 2.1 or later
5    * as published by the Free Software Foundation. This program is distributed
6    * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
7    * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
8    * See the GNU Lesser General Public License for more details.
9    *
10   * The License is available on the internet at:
11   *      http://www.gnu.org/copyleft/lgpl.html
12   * or by writing to:
13   *      Free Software Foundation, Inc.
14   *      59 Temple Place - Suite 330
15   *      Boston, MA 02111-1307, USA
16   *
17   * © CrossWire Bible Society, 2007 - 2016
18   *
19   */
20  package org.crosswire.common.util;
21  
22  import java.util.Locale;
23  
24  /**
25   * An immutable Language by specification. The specifier consists of up to three parts:
26   * <ul>
27   * <li>LL - An iso639-2 or iso639-3 language code</li>
28   * <li>SSSS - A 4-letter iso15924 script code</li>
29   * <li>CC - A 2-letter iso3166 country code</li>
30   * </ul>
31   * Note: This is a subset of the BCP-47 standard.
32   * 
33   * @see gnu.lgpl.License The GNU Lesser General Public License for details.
34   * @author DM Smith
35   */
36  public class Language implements Comparable<Language> {
37      /**
38       * The default language code is en for English.
39       */
40      public static final String DEFAULT_LANG_CODE = "en";
41  
42      /**
43       * The language code for invalid language specifications is und for Undetermined.
44       */
45      public static final String UNKNOWN_LANG_CODE = "und";
46  
47      /**
48       * The default language is English.
49       */
50      public static final Language DEFAULT_LANG = new Language(DEFAULT_LANG_CODE);
51  
52  
53      /**
54       * A single language defined by an ISO-639 code. If the code is null or
55       * empty then it is considered to be DEFAULT_LANG (that is, English).
56       * 
57       * @param specification
58       *            the specifier for the particular language
59       */
60      public Language(String specification) {
61          given = specification;
62          parse(given);
63      }
64  
65      /**
66       * The specification that was given might not be be the one that
67       * ultimately gets the name.
68       * 
69       * @return the specification that was originally given.
70       */
71      public String getGivenSpecification() {
72          return given;
73      }
74  
75      /**
76       * The specification that was given might not be be the one that
77       * ultimately gets the name.
78       * 
79       * @return the specification that was used to find the name.
80       */
81      public String getFoundSpecification() {
82          getName();
83          return found;
84      }
85  
86      /**
87       * Determine whether this language is valid.
88       * <ul>
89       * <li>LL - An iso639-2 or iso639-3 language code</li>
90       * <li>SSSS - A 4-letter iso15924 script code</li>
91       * <li>CC - A 2-letter iso3166 country code</li>
92       * </ul>
93       * 
94       * @return true if the language is valid.
95       */
96      public boolean isValidLanguage() {
97          getName();
98          return valid;
99      }
100 
101     /**
102      * Get the iso639 language code.
103      * 
104      * @return the code for the language in lower case.
105      */
106     public String getCode() {
107         return code;
108     }
109 
110     /**
111      * Get the iso15924 script for the language. May be null.
112      * 
113      * @return the code for the script in Title case.
114      */
115     public String getScript() {
116         return script;
117     }
118 
119     /**
120      * Get the iso3166 script for the language. May be null.
121      * 
122      * @return the code for the country in UPPER case.
123      */
124     public String getCountry() {
125         return country;
126     }
127 
128     /**
129      * Get the localized language name.
130      * 
131      * @return the name of the language
132      */
133     public String getName() {
134         // Note: This is not quite thread safe. Unless name is volatile.
135         // But it will just do the work multiple times.
136         if (name == null) {
137             boolean more = true;
138             // Code is the ultimate fallback
139             String result = code;
140             String lookup = code;
141 
142             StringBuilder sb = new StringBuilder();
143             // The lookup is as follows.
144             // There is always a code
145             // If all parts are specified then use that
146             if (script != null && country != null) {
147                 sb.append(code);
148                 sb.append('-');
149                 sb.append(script);
150                 sb.append('-');
151                 sb.append(country);
152                 lookup = sb.toString();
153                 result = Languages.getName(lookup);
154                 more = lookup.equals(result);
155             }
156 
157             // If script is specified it has precedence over country
158             if (more && script != null) {
159                 sb.setLength(0);
160                 sb.append(code);
161                 sb.append('-');
162                 sb.append(script);
163                 lookup = sb.toString();
164                 result = Languages.getName(lookup);
165                 more = lookup.equals(result);
166             }
167 
168             // If country was specified, check for that now.
169             if (more && country != null) {
170                 sb.setLength(0);
171                 sb.append(code);
172                 sb.append('-');
173                 sb.append(country);
174                 lookup = sb.toString();
175                 result = Languages.getName(lookup);
176                 more = lookup.equals(result);
177             }
178 
179             // Now check just the code.
180             if (more) {
181                 lookup = code;
182                 result = Languages.getName(lookup);
183                 more = lookup.equals(result);
184             }
185 
186             // Oops, the localized lookup failed.
187             // See if Java has one.
188             if (more) {
189                 lookup = code;
190                 result = new Locale(lookup).getDisplayLanguage();
191                 more = lookup.equals(result);
192             }
193 
194             // Oops, Java doesn't have a clue
195             // Look into our heavy handed listing
196             if (more) {
197                 lookup = code;
198                 result = Languages.AllLanguages.getName(lookup);
199                 more = lookup.equals(result);
200             }
201 
202             // Oops, didn't find it anywhere. Mark it as invalid.
203             if (more) {
204                 valid = false;
205             }
206             // now that we are here go with what we last used and got
207             found = lookup;
208             // Assign name last to help with synchronization issues
209             name = result;
210         }
211         return name;
212     }
213 
214     /**
215      * Determine whether this language is a Left-to-Right or a Right-to-Left
216      * language. If the language has a script, it is used for the determination.
217      * Otherwise, check the language.
218      * <p>
219      * Note: This is problematic. Languages do not have direction.
220      * Scripts do. Further, there are over 7000 living languages, many of which
221      * are written in Right-to-Left scripts and are not listed here.
222      * </p>
223      * 
224      * @return true if the language is Left-to-Right.
225      */
226     public boolean isLeftToRight() {
227         if (!knowsDirection) {
228             ltor = !Languages.RtoL.isRtoL(script, code);
229             knowsDirection = true;
230         }
231         return ltor;
232     }
233 
234     /* (non-Javadoc)
235      * @see java.lang.Object#hashCode()
236      */
237     @Override
238     public int hashCode() {
239         if (found == null) {
240             getName();
241         }
242         return found.hashCode();
243     }
244 
245     /* (non-Javadoc)
246      * @see java.lang.Object#equals(java.lang.Object)
247      */
248     @Override
249     public boolean equals(Object obj) {
250         if (this == obj) {
251             return true;
252         }
253 
254         if (obj == null || getClass() != obj.getClass()) {
255             return false;
256         }
257 
258         final Language other = (Language) obj;
259 
260         return code.equals(other.code)  && compareStrings(script, other.script) && compareStrings(country, other.country);
261     }
262 
263     /* (non-Javadoc)
264      * @see java.lang.Object#toString()
265      */
266     @Override
267     public String toString() {
268         return getName();
269     }
270 
271     /* (non-Javadoc)
272      * @see java.lang.Comparable#compareTo(java.lang.Object)
273      */
274     public int compareTo(Language o) {
275         return getName().compareTo(o.getName());
276     }
277 
278     /**
279      * Split the specification on '-' into 1 to 3 parts.
280      * 
281      * @param spec the specification to parse
282      */
283     private void parse(String spec) {
284         String specification = spec;
285         if (specification == null) {
286             specification = DEFAULT_LANG_CODE;
287         }
288 
289         int len = specification.length();
290 
291         // It used to be that SWORD modules used x- and X- as a language prefix
292         // for minority languages. Now that we have a complete iso639 spec,
293         // SWORD does not use it.
294         if (len < 2 || specification.charAt(0) == '-' || specification.charAt(1) == '-') {
295             valid = false;
296             code = UNKNOWN_LANG_CODE;
297             return;
298         }
299 
300         // Obvious optimization of the most common case: only the language code is given
301         if (len <= 3) {
302             code = CanonicalUtils.getLanguage(specification, 0, len);
303         }
304 
305         int partLen = 0;
306         int start = 0;
307         int split;
308         for (split = 2; split < len; ++split) {
309             char c = specification.charAt(split);
310             if (c == '-') {
311                 break;
312             }
313         }
314         code = CanonicalUtils.getLanguage(specification, start, split);
315         partLen = split - start;
316         valid = partLen == 2 || partLen == 3;
317         start = split + 1;
318 
319         // Get the second part. It is either a script or a country code
320         if (split < len) {
321             for (split = start; split < len; ++split) {
322                 char c = specification.charAt(split);
323                 if (c == '-') {
324                     break;
325                 }
326             }
327             partLen = split - start;
328             if (partLen == 4) {
329                 script = CanonicalUtils.getScript(specification, start, split);
330             } else if (partLen == 2) {
331                 country = CanonicalUtils.getCountry(specification, start, split);
332             } else {
333                 valid = false;
334             }
335             start = split + 1;
336         }
337 
338         // Get the third part, if any. It can only be a country code.
339         if (country == null && split < len) {
340             for (split = start; split < len; ++split) {
341                 char c = specification.charAt(split);
342                 if (c == '-') {
343                     break;
344                 }
345             }
346             partLen = split - start;
347             if (partLen == 2) {
348                 country = CanonicalUtils.getCountry(specification, start, split);
349             } else {
350                 valid = false;
351             }
352             start = split + 1;
353         }
354 
355         if (start <= len) {
356             valid = false;
357         }
358     }
359 
360     /**
361      * Equal if both a and b are the same.
362      * 
363      * @param a a string to compare
364      * @param b a string to compare
365      * @return true if both are the same.
366      */
367     private boolean compareStrings(String a, String b) {
368         return (a == null && b == null) || (a != null && a.equals(b));
369     }
370 
371     /**
372      * Converts substrings to the canonical representation for language code, script and country.
373      */
374     private static final class CanonicalUtils {
375         /**
376          * Utility class. Private constructor.
377          */
378         private CanonicalUtils() {
379         }
380 
381         /**
382          * The iso639 language code's canonical form is lower case.
383          * 
384          * @param specification
385          *            the bcp47 specification of the language
386          * @param start
387          *            the start of the code
388          * @param end
389          *            the position of the character following the code
390          * @return the canonical representation for the code
391          */
392         public static String getLanguage(String specification, int start, int end) {
393 
394             // An empty string means no work
395             if (start == end) {
396                 return null;
397             }
398 
399             // Avoid construction by analyzing the string
400             // to see if it is already LanguageCase.
401             // Find the first character that is not LanguageCase
402             int first;
403             for (first = start; first < end && isLowerASCII(specification.charAt(first)); ++first) {
404                 continue; // keep going
405             }
406 
407             // If we get to the end of the string then it is CountryCase
408             if (first == end) {
409                 return specification.substring(start, end);
410             }
411 
412             // Bummer, we need to do work
413             int len = end - start;
414             char[] buf = new char[len];
415             int i = 0;
416             for (int j = start; j < end; ++j) {
417                 buf[i++] = j < first ? specification.charAt(j) : toLowerASCII(specification.charAt(j));
418             }
419             return new String(buf);
420         }
421 
422         /**
423          * The iso3166 country code's canonical form is upper case.
424          * 
425          * @param specification
426          *            the bcp47 specification of the language
427          * @param start
428          *            the start of the code
429          * @param end
430          *            the position of the character following the code
431          * @return the canonical representation for the code
432          */
433         public static String getCountry(String specification, int start, int end) {
434 
435             // An empty string means no work
436             if (start == end) {
437                 return null;
438             }
439 
440             // Avoid construction by analyzing the string
441             // to see if it is already CountryCase.
442             // Find the first character that is not CountryCase
443             int first;
444             for (first = start; first < end && isUpperASCII(specification.charAt(first)); ++first) {
445                 continue; // keep going
446             }
447 
448             // If we get to the end of the string then it is CountryCase
449             if (first == end) {
450                 return specification.substring(start, end);
451             }
452 
453             // Bummer, we need to do work
454             int len = end - start;
455             char[] buf = new char[len];
456             int i = 0;
457             for (int j = start; j < end; ++j) {
458                 buf[i++] = j < first ? specification.charAt(j) : toUpperASCII(specification.charAt(j));
459             }
460             return new String(buf);
461         }
462 
463         /**
464          * The iso15924 script code's canonical form is title case.
465          * 
466          * @param specification
467          *            the bcp47 specification of the language
468          * @param start
469          *            the start of the code
470          * @param end
471          *            the position of the character following the code
472          * @return the canonical representation for the code
473          */
474         public static String getScript(String specification, int start, int end) {
475 
476             // An empty string means no work
477             if (start == end) {
478                 return null;
479             }
480 
481             // Avoid construction by analyzing the string
482             // to see if it is already ScriptCase.
483             // Find the first character that is not ScriptCase
484             int first = start;
485             if (isUpperASCII(specification.charAt(start))) {
486                 for (first = start + 1; first < end && isLowerASCII(specification.charAt(first)); ++first) {
487                     continue; // keep going
488                 }
489 
490                 // If we get to the end of the string then it is ScriptCase
491                 if (first == end) {
492                     return specification.substring(start, end);
493                 }
494             }
495 
496             // Bummer, we need to do work.
497             int len = end - start;
498             char[] buf = new char[len];
499             buf[0] = first == start ? toUpperASCII(specification.charAt(first)) : specification.charAt(first);
500             int i = 1;
501             for (int j = start + 1; j < end; ++j) {
502                 buf[i++] = j < first ? specification.charAt(j) : toLowerASCII(specification.charAt(j));
503             }
504             return new String(buf);
505         }
506 
507         /**
508          * Determine whether the character is one of A-Z.
509          * 
510          * @param c the character to examine
511          * @return true if it is in A-Z
512          */
513         private static boolean isUpperASCII(char c) {
514             return c >= 'A' && c <= 'Z';
515         }
516 
517         /**
518          * Determine whether the character is one of a-z.
519          * 
520          * @param c the character to examine
521          * @return true if it is in a-z
522          */
523         private static boolean isLowerASCII(char c) {
524             return c >= 'a' && c <= 'z';
525         }
526 
527         /**
528          * Convert a character, in in a-z to its upper case value, otherwise leave it alone.
529          * 
530          * @param c the character to convert, if in a-z
531          * @return the upper case ASCII representation of the character or the character itself.
532          */
533         private static char toUpperASCII(char c) {
534             return isLowerASCII(c) ? (char) (c - 32) : c;
535         }
536 
537         /**
538          * Convert a character, in in A-Z to its lower case value, otherwise leave it alone.
539          * 
540          * @param c the character to convert, if in A-Z
541          * @return the lower case ASCII representation of the character or the character itself.
542          */
543         private static char toLowerASCII(char c) {
544             return isUpperASCII(c) ? (char) (c + 32) : c;
545         }
546     }
547 
548     /**
549      * The original specification provided by the user.
550      */
551     private String given;
552     /**
553      * The effective specification.
554      */
555     private String found;
556     /**
557      * The lower case iso639 language code. 
558      */
559     private String code;
560     /**
561      * The Title case iso15924 script code.
562      */
563     private String script;
564     /**
565      * The UPPER case iso3166 country code. 
566      */
567     private String country;
568     /**
569      * The name as defined by Languages. 
570      */
571     private String name;
572     /**
573      * Flag to store whether the code is valid.
574      */
575     private boolean valid;
576     private boolean knowsDirection;
577     private boolean ltor;
578 }
579