Coverage Report - org.crosswire.common.util.Language
 
Classes in this File Line Coverage Branch Coverage Complexity
Language
0%
0/122
0%
0/88
4.174
Language$CanonicalUtils
0%
0/42
0%
0/52
4.174
 
 1  
 /**
 2  
  * Distribution License:
 3  
  * JSword is free software; you can redistribute it and/or modify it under
 4  
  * the terms of the GNU Lesser General Public License, version 2.1 or later
 5  
  * as published by the Free Software Foundation. This program is distributed
 6  
  * in the hope that it will be useful, but WITHOUT ANY WARRANTY; without even
 7  
  * the implied warranty of MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE.
 8  
  * See the GNU Lesser General Public License for more details.
 9  
  *
 10  
  * The License is available on the internet at:
 11  
  *      http://www.gnu.org/copyleft/lgpl.html
 12  
  * or by writing to:
 13  
  *      Free Software Foundation, Inc.
 14  
  *      59 Temple Place - Suite 330
 15  
  *      Boston, MA 02111-1307, USA
 16  
  *
 17  
  * © CrossWire Bible Society, 2007 - 2016
 18  
  *
 19  
  */
 20  
 package org.crosswire.common.util;
 21  
 
 22  
 import java.util.Locale;
 23  
 
 24  
 /**
 25  
  * An immutable Language by specification. The specifier consists of up to three parts:
 26  
  * <ul>
 27  
  * <li>LL - An iso639-2 or iso639-3 language code</li>
 28  
  * <li>SSSS - A 4-letter iso15924 script code</li>
 29  
  * <li>CC - A 2-letter iso3166 country code</li>
 30  
  * </ul>
 31  
  * Note: This is a subset of the BCP-47 standard.
 32  
  * 
 33  
  * @see gnu.lgpl.License The GNU Lesser General Public License for details.
 34  
  * @author DM Smith
 35  
  */
 36  0
 public class Language implements Comparable<Language> {
 37  
     /**
 38  
      * The default language code is en for English.
 39  
      */
 40  
     public static final String DEFAULT_LANG_CODE = "en";
 41  
 
 42  
     /**
 43  
      * The language code for invalid language specifications is und for Undetermined.
 44  
      */
 45  
     public static final String UNKNOWN_LANG_CODE = "und";
 46  
 
 47  
     /**
 48  
      * The default language is English.
 49  
      */
 50  0
     public static final Language DEFAULT_LANG = new Language(DEFAULT_LANG_CODE);
 51  
 
 52  
 
 53  
     /**
 54  
      * A single language defined by an ISO-639 code. If the specification is null
 54  
      * it is treated as DEFAULT_LANG_CODE (English); if empty, as UNKNOWN_LANG_CODE.
 56  
      * 
 57  
      * @param specification
 58  
      *            the specifier for the particular language
 59  
      */
 60  0
     public Language(String specification) {
 61  0
         given = specification;
 62  0
         parse(given);
 63  0
     }
 64  
 
 65  
     /**
 66  
      * The specification that was given might not be the one that
 67  
      * ultimately gets the name.
 68  
      * 
 69  
      * @return the specification that was originally given.
 70  
      */
 71  
     public String getGivenSpecification() {
 72  0
         return given;
 73  
     }
 74  
 
 75  
     /**
 76  
      * The specification that was given might not be the one that
 77  
      * ultimately gets the name.
 78  
      * 
 79  
      * @return the specification that was used to find the name.
 80  
      */
 81  
     public String getFoundSpecification() {
 82  0
         getName();
 83  0
         return found;
 84  
     }
 85  
 
 86  
     /**
 87  
      * Determine whether this language is valid.
 88  
      * <ul>
 89  
      * <li>LL - An iso639-2 or iso639-3 language code</li>
 90  
      * <li>SSSS - A 4-letter iso15924 script code</li>
 91  
      * <li>CC - A 2-letter iso3166 country code</li>
 92  
      * </ul>
 93  
      * 
 94  
      * @return true if the language is valid.
 95  
      */
 96  
     public boolean isValidLanguage() {
 97  0
         getName();
 98  0
         return valid;
 99  
     }
 100  
 
 101  
     /**
 102  
      * Get the iso639 language code.
 103  
      * 
 104  
      * @return the code for the language in lower case.
 105  
      */
 106  
     public String getCode() {
 107  0
         return code;
 108  
     }
 109  
 
 110  
     /**
 111  
      * Get the iso15924 script for the language. May be null.
 112  
      * 
 113  
      * @return the code for the script in Title case.
 114  
      */
 115  
     public String getScript() {
 116  0
         return script;
 117  
     }
 118  
 
 119  
     /**
 120  
      * Get the iso3166 country for the language. May be null.
 121  
      * 
 122  
      * @return the code for the country in UPPER case.
 123  
      */
 124  
     public String getCountry() {
 125  0
         return country;
 126  
     }
 127  
 
 128  
     /**
 129  
      * Get the localized language name.
 130  
      * 
 131  
      * @return the name of the language
 132  
      */
 133  
     public String getName() {
 134  
         // Note: This is not quite thread safe. Unless name is volatile.
 135  
         // But it will just do the work multiple times.
 136  0
         if (name == null) {
 137  0
             boolean more = true;
 138  
             // Code is the ultimate fallback
 139  0
             String result = code;
 140  0
             String lookup = code;
 141  
 
 142  0
             StringBuilder sb = new StringBuilder();
 143  
             // The lookup is as follows.
 144  
             // There is always a code
 145  
             // If all parts are specified then use that
 146  0
             if (script != null && country != null) {
 147  0
                 sb.append(code);
 148  0
                 sb.append('-');
 149  0
                 sb.append(script);
 150  0
                 sb.append('-');
 151  0
                 sb.append(country);
 152  0
                 lookup = sb.toString();
 153  0
                 result = Languages.getName(lookup);
 154  0
                 more = lookup.equals(result);
 155  
             }
 156  
 
 157  
             // If script is specified it has precedence over country
 158  0
             if (more && script != null) {
 159  0
                 sb.setLength(0);
 160  0
                 sb.append(code);
 161  0
                 sb.append('-');
 162  0
                 sb.append(script);
 163  0
                 lookup = sb.toString();
 164  0
                 result = Languages.getName(lookup);
 165  0
                 more = lookup.equals(result);
 166  
             }
 167  
 
 168  
             // If country was specified, check for that now.
 169  0
             if (more && country != null) {
 170  0
                 sb.setLength(0);
 171  0
                 sb.append(code);
 172  0
                 sb.append('-');
 173  0
                 sb.append(country);
 174  0
                 lookup = sb.toString();
 175  0
                 result = Languages.getName(lookup);
 176  0
                 more = lookup.equals(result);
 177  
             }
 178  
 
 179  
             // Now check just the code.
 180  0
             if (more) {
 181  0
                 lookup = code;
 182  0
                 result = Languages.getName(lookup);
 183  0
                 more = lookup.equals(result);
 184  
             }
 185  
 
 186  
             // Oops, the localized lookup failed.
 187  
             // See if Java has one.
 188  0
             if (more) {
 189  0
                 lookup = code;
 190  0
                 result = new Locale(lookup).getDisplayLanguage();
 191  0
                 more = lookup.equals(result);
 192  
             }
 193  
 
 194  
             // Oops, Java doesn't have a clue
 195  
             // Look into our heavy handed listing
 196  0
             if (more) {
 197  0
                 lookup = code;
 198  0
                 result = Languages.AllLanguages.getName(lookup);
 199  0
                 more = lookup.equals(result);
 200  
             }
 201  
 
 202  
             // Oops, didn't find it anywhere. Mark it as invalid.
 203  0
             if (more) {
 204  0
                 valid = false;
 205  
             }
 206  
             // now that we are here go with what we last used and got
 207  0
             found = lookup;
 208  
             // Assign name last to help with synchronization issues
 209  0
             name = result;
 210  
         }
 211  0
         return name;
 212  
     }
 213  
 
 214  
     /**
 215  
      * Determine whether this language is a Left-to-Right or a Right-to-Left
 216  
      * language. If the language has a script, it is used for the determination.
 217  
      * Otherwise, check the language.
 218  
      * <p>
 219  
      * Note: This is problematic. Languages do not have direction.
 220  
      * Scripts do. Further, there are over 7000 living languages, many of which
 221  
      * are written in Right-to-Left scripts and are not listed here.
 222  
      * </p>
 223  
      * 
 224  
      * @return true if the language is Left-to-Right.
 225  
      */
 226  
     public boolean isLeftToRight() {
 227  0
         if (!knowsDirection) {
 228  0
             ltor = !Languages.RtoL.isRtoL(script, code);
 229  0
             knowsDirection = true;
 230  
         }
 231  0
         return ltor;
 232  
     }
 233  
 
 234  
     /* (non-Javadoc)
 235  
      * @see java.lang.Object#hashCode()
 236  
      */
 237  
     @Override
 238  
     public int hashCode() {
 239  0
         if (found == null) {
 240  0
             getName();
 241  
         }
 242  0
         return found.hashCode();
 243  
     }
 244  
 
 245  
     /* (non-Javadoc)
 246  
      * @see java.lang.Object#equals(java.lang.Object)
 247  
      */
 248  
     @Override
 249  
     public boolean equals(Object obj) {
 250  0
         if (this == obj) {
 251  0
             return true;
 252  
         }
 253  
 
 254  0
         if (obj == null || getClass() != obj.getClass()) {
 255  0
             return false;
 256  
         }
 257  
 
 258  0
         final Language other = (Language) obj;
 259  
 
 260  0
         return code.equals(other.code)  && compareStrings(script, other.script) && compareStrings(country, other.country);
 261  
     }
 262  
 
 263  
     /* (non-Javadoc)
 264  
      * @see java.lang.Object#toString()
 265  
      */
 266  
     @Override
 267  
     public String toString() {
 268  0
         return getName();
 269  
     }
 270  
 
 271  
     /* (non-Javadoc)
 272  
      * @see java.lang.Comparable#compareTo(java.lang.Object)
 273  
      */
 274  
     public int compareTo(Language o) {
 275  0
         return getName().compareTo(o.getName());
 276  
     }
 277  
 
 278  
     /**
 279  
      * Split the specification on '-' into 1 to 3 parts.
 280  
      * 
 281  
      * @param spec the specification to parse
 282  
      */
 283  
     private void parse(String spec) {
 284  0
         String specification = spec;
 285  0
         if (specification == null) {
 286  0
             specification = DEFAULT_LANG_CODE;
 287  
         }
 288  
 
 289  0
         int len = specification.length();
 290  
 
 291  
         // It used to be that SWORD modules used x- and X- as a language prefix
 292  
         // for minority languages. Now that we have a complete iso639 spec,
 293  
         // SWORD does not use it.
 294  0
         if (len < 2 || specification.charAt(0) == '-' || specification.charAt(1) == '-') {
 295  0
             valid = false;
 296  0
             code = UNKNOWN_LANG_CODE;
 297  0
             return;
 298  
         }
 299  
 
 300  
         // Obvious optimization of the most common case: only the language code is given
 301  0
         if (len <= 3) {
 302  0
             code = CanonicalUtils.getLanguage(specification, 0, len);
 303  
         }
 304  
 
 305  0
         int partLen = 0;
 306  0
         int start = 0;
 307  
         int split;
 308  0
         for (split = 2; split < len; ++split) {
 309  0
             char c = specification.charAt(split);
 310  0
             if (c == '-') {
 311  0
                 break;
 312  
             }
 313  
         }
 314  0
         code = CanonicalUtils.getLanguage(specification, start, split);
 315  0
         partLen = split - start;
 316  0
         valid = partLen == 2 || partLen == 3;
 317  0
         start = split + 1;
 318  
 
 319  
         // Get the second part. It is either a script or a country code
 320  0
         if (split < len) {
 321  0
             for (split = start; split < len; ++split) {
 322  0
                 char c = specification.charAt(split);
 323  0
                 if (c == '-') {
 324  0
                     break;
 325  
                 }
 326  
             }
 327  0
             partLen = split - start;
 328  0
             if (partLen == 4) {
 329  0
                 script = CanonicalUtils.getScript(specification, start, split);
 330  0
             } else if (partLen == 2) {
 331  0
                 country = CanonicalUtils.getCountry(specification, start, split);
 332  
             } else {
 333  0
                 valid = false;
 334  
             }
 335  0
             start = split + 1;
 336  
         }
 337  
 
 338  
         // Get the third part, if any. It can only be a country code.
 339  0
         if (country == null && split < len) {
 340  0
             for (split = start; split < len; ++split) {
 341  0
                 char c = specification.charAt(split);
 342  0
                 if (c == '-') {
 343  0
                     break;
 344  
                 }
 345  
             }
 346  0
             partLen = split - start;
 347  0
             if (partLen == 2) {
 348  0
                 country = CanonicalUtils.getCountry(specification, start, split);
 349  
             } else {
 350  0
                 valid = false;
 351  
             }
 352  0
             start = split + 1;
 353  
         }
 354  
 
 355  0
         if (start <= len) {
 356  0
             valid = false;
 357  
         }
 358  0
     }
 359  
 
 360  
     /**
 361  
      * Equal if both a and b are the same.
 362  
      * 
 363  
      * @param a a string to compare
 364  
      * @param b a string to compare
 365  
      * @return true if both are the same.
 366  
      */
 367  
     private boolean compareStrings(String a, String b) {
 368  0
         return (a == null && b == null) || (a != null && a.equals(b));
 369  
     }
 370  
 
 371  
     /**
 372  
      * Converts substrings to the canonical representation for language code, script and country.
 373  
      */
 374  
     private static final class CanonicalUtils {
 375  
         /**
 376  
          * Utility class. Private constructor.
 377  
          */
 378  0
         private CanonicalUtils() {
 379  0
         }
 380  
 
 381  
         /**
 382  
          * The iso639 language code's canonical form is lower case.
 383  
          * 
 384  
          * @param specification
 385  
          *            the bcp47 specification of the language
 386  
          * @param start
 387  
          *            the start of the code
 388  
          * @param end
 389  
          *            the position of the character following the code
 390  
          * @return the canonical representation for the code
 391  
          */
 392  
         public static String getLanguage(String specification, int start, int end) {
 393  
 
 394  
             // An empty string means no work
 395  0
             if (start == end) {
 396  0
                 return null;
 397  
             }
 398  
 
 399  
             // Avoid construction by analyzing the string
 400  
             // to see if it is already LanguageCase.
 401  
             // Find the first character that is not LanguageCase
 402  
             int first;
 403  0
             for (first = start; first < end && isLowerASCII(specification.charAt(first)); ++first) {
 404  
                 continue; // keep going
 405  
             }
 406  
 
 407  
             // If we get to the end of the string then it is LanguageCase
 408  0
             if (first == end) {
 409  0
                 return specification.substring(start, end);
 410  
             }
 411  
 
 412  
             // Bummer, we need to do work
 413  0
             int len = end - start;
 414  0
             char[] buf = new char[len];
 415  0
             int i = 0;
 416  0
             for (int j = start; j < end; ++j) {
 417  0
                 buf[i++] = j < first ? specification.charAt(j) : toLowerASCII(specification.charAt(j));
 418  
             }
 419  0
             return new String(buf);
 420  
         }
 421  
 
 422  
         /**
 423  
          * The iso3166 country code's canonical form is upper case.
 424  
          * 
 425  
          * @param specification
 426  
          *            the bcp47 specification of the language
 427  
          * @param start
 428  
          *            the start of the code
 429  
          * @param end
 430  
          *            the position of the character following the code
 431  
          * @return the canonical representation for the code
 432  
          */
 433  
         public static String getCountry(String specification, int start, int end) {
 434  
 
 435  
             // An empty string means no work
 436  0
             if (start == end) {
 437  0
                 return null;
 438  
             }
 439  
 
 440  
             // Avoid construction by analyzing the string
 441  
             // to see if it is already CountryCase.
 442  
             // Find the first character that is not CountryCase
 443  
             int first;
 444  0
             for (first = start; first < end && isUpperASCII(specification.charAt(first)); ++first) {
 445  
                 continue; // keep going
 446  
             }
 447  
 
 448  
             // If we get to the end of the string then it is CountryCase
 449  0
             if (first == end) {
 450  0
                 return specification.substring(start, end);
 451  
             }
 452  
 
 453  
             // Bummer, we need to do work
 454  0
             int len = end - start;
 455  0
             char[] buf = new char[len];
 456  0
             int i = 0;
 457  0
             for (int j = start; j < end; ++j) {
 458  0
                 buf[i++] = j < first ? specification.charAt(j) : toUpperASCII(specification.charAt(j));
 459  
             }
 460  0
             return new String(buf);
 461  
         }
 462  
 
 463  
         /**
 464  
          * The iso15924 script code's canonical form is title case.
 465  
          * 
 466  
          * @param specification
 467  
          *            the bcp47 specification of the language
 468  
          * @param start
 469  
          *            the start of the code
 470  
          * @param end
 471  
          *            the position of the character following the code
 472  
          * @return the canonical representation for the code
 473  
          */
 474  
         public static String getScript(String specification, int start, int end) {
 475  
 
 476  
             // An empty string means no work
 477  0
             if (start == end) {
 478  0
                 return null;
 479  
             }
 480  
 
 481  
             // Avoid construction by analyzing the string
 482  
             // to see if it is already ScriptCase.
 483  
             // Find the first character that is not ScriptCase
 484  0
             int first = start;
 485  0
             if (isUpperASCII(specification.charAt(start))) {
 486  0
                 for (first = start + 1; first < end && isLowerASCII(specification.charAt(first)); ++first) {
 487  
                     continue; // keep going
 488  
                 }
 489  
 
 490  
                 // If we get to the end of the string then it is ScriptCase
 491  0
                 if (first == end) {
 492  0
                     return specification.substring(start, end);
 493  
                 }
 494  
             }
 495  
 
 496  
             // Bummer, we need to do work.
 497  0
             int len = end - start;
 498  0
             char[] buf = new char[len];
 499  0
             buf[0] = first == start ? toUpperASCII(specification.charAt(first)) : specification.charAt(first);
 500  0
             int i = 1;
 501  0
             for (int j = start + 1; j < end; ++j) {
 502  0
                 buf[i++] = j < first ? specification.charAt(j) : toLowerASCII(specification.charAt(j));
 503  
             }
 504  0
             return new String(buf);
 505  
         }
 506  
 
 507  
         /**
 508  
          * Determine whether the character is one of A-Z.
 509  
          * 
 510  
          * @param c the character to examine
 511  
          * @return true if it is in A-Z
 512  
          */
 513  
         private static boolean isUpperASCII(char c) {
 514  0
             return c >= 'A' && c <= 'Z';
 515  
         }
 516  
 
 517  
         /**
 518  
          * Determine whether the character is one of a-z.
 519  
          * 
 520  
          * @param c the character to examine
 521  
          * @return true if it is in a-z
 522  
          */
 523  
         private static boolean isLowerASCII(char c) {
 524  0
             return c >= 'a' && c <= 'z';
 525  
         }
 526  
 
 527  
         /**
 528  
          * Convert a character in a-z to its upper case value; otherwise leave it alone.
 529  
          * 
 530  
          * @param c the character to convert, if in a-z
 531  
          * @return the upper case ASCII representation of the character or the character itself.
 532  
          */
 533  
         private static char toUpperASCII(char c) {
 534  0
             return isLowerASCII(c) ? (char) (c - 32) : c;
 535  
         }
 536  
 
 537  
         /**
 538  
          * Convert a character in A-Z to its lower case value; otherwise leave it alone.
 539  
          * 
 540  
          * @param c the character to convert, if in A-Z
 541  
          * @return the lower case ASCII representation of the character or the character itself.
 542  
          */
 543  
         private static char toLowerASCII(char c) {
 544  0
             return isUpperASCII(c) ? (char) (c + 32) : c;
 545  
         }
 546  
     }
 547  
 
 548  
     /**
 549  
      * The original specification provided by the user.
 550  
      */
 551  
     private String given;
 552  
     /**
 553  
      * The effective specification.
 554  
      */
 555  
     private String found;
 556  
     /**
 557  
      * The lower case iso639 language code. 
 558  
      */
 559  
     private String code;
 560  
     /**
 561  
      * The Title case iso15924 script code.
 562  
      */
 563  
     private String script;
 564  
     /**
 565  
      * The UPPER case iso3166 country code. 
 566  
      */
 567  
     private String country;
 568  
     /**
 569  
      * The name as defined by Languages. 
 570  
      */
 571  
     private String name;
 572  
     /**
 573  
      * Flag to store whether the code is valid.
 574  
      */
 575  
     private boolean valid;
 576  
     private boolean knowsDirection;
 577  
     private boolean ltor;
 578  
 }