diff --git a/build.gradle b/build.gradle index a9bbf127..f5598de1 100644 --- a/build.gradle +++ b/build.gradle @@ -32,8 +32,18 @@ dependencies { testCompile 'junit:junit:4.11' } -test { - systemProperty 'test_with_php', "$System.env.TEST_WITH_PHP" +javadoc { + options.docEncoding 'utf-8' + options.charSet 'utf-8' + options.windowTitle 'Kana Tools for Java SDK' + options.noNavBar true + options.noHelp true + options.noDeprecated true + options.noDeprecatedList true + options.noQualifiers 'all' + options.memberLevel = JavadocMemberLevel.PUBLIC + options.links 'https://docs.oracle.com/javase/8/docs/api/' + options.stylesheetFile = new File(projectDir, "docs/kanatools-javadocs.css"); } jacoco { diff --git a/docs/README.md b/docs/README.md new file mode 100644 index 00000000..fe45e941 --- /dev/null +++ b/docs/README.md @@ -0,0 +1,8 @@ +Kana Tools for Java - Documentation +=================================== + +#### Website +* [Online documentation](http://mariten.github.io/kanatools-java/) + +#### Code +* [See the `gh-pages` branch of this repo](https://github.com/mariten/kanatools-java/tree/gh-pages) diff --git a/docs/kanatools-javadocs.css b/docs/kanatools-javadocs.css new file mode 100644 index 00000000..ad7887e4 --- /dev/null +++ b/docs/kanatools-javadocs.css @@ -0,0 +1,488 @@ +body { + color: #333; + font-family: 'Helvetica Neue', Calibri, sans-serif; + margin: 0; +} +hr { + display: none; +} +caption { + display: none; +} +a:link, +a:visited { + text-decoration: none; + color: rgb(26, 116, 186); +} +a:hover, +a:focus { + text-decoration: none; + color: rgb(22, 100, 160); +} +a:active { + text-decoration: none; + color: #4c6b87; +} +a[name] { + color: #353833; +} +a[name]:hover { + text-decoration: none; + color: #353833; +} +pre { + font-family: Menlo, Consolas, monospace; + font-size: 1em; +} +code { + font-family: Menlo, Consolas, monospace; + font-size: 1em; +} +h1 { + font-size: 2em; +} +h2 { + font-size: 1.7em; +} +h3 { + font-size: 1.5em; +} +h4 { + 
font-size: 1.2em; +} +h5 { + font-size: 1.1em; +} +h6 { + font-size: 1.1em; +} +ul { + list-style-type: disc; +} +table tr td dt code { + vertical-align: top; +} +sup { + font-size: .6em; +} +h3 a:link, +h3 a:visited { + text-decoration: none; + color: rgb(153, 153, 153); +} +h3 a:hover, +h3 a:focus { + text-decoration: none; + color: rgb(169, 169, 169); +} +tbody:first-of-type tr:nth-child(odd) { + background: #f3f3f3 +} +tbody:nth-of-type(2) tr:nth-child(even) { + background: #f3f3f3 +} +tbody:first-of-type tr:first-child { + font-size: 1.2em +} +.constantValuesContainer h2 { + margin: 0 0 0 .5882em; +} +.details { + margin: 1em; +} +.clear { + clear: both; + height: 0px; + overflow: hidden; +} +.aboutLanguage { + float: right; + padding: 0px 21px; + font-size: .8em; + z-index: 200; + margin-top: -7px; +} +.legalCopy { + margin-left: .5em; +} +.bar a, +.bar a:link, +.bar a:visited, +.bar a:active { + color: #FFFFFF; + text-decoration: none; +} +.bar a:hover, +.bar a:focus { + color: #bb7a2a; +} +.tab { + background-color: #0066FF; + background-image: url(resources/titlebar.gif); + background-position: left top; + background-repeat: no-repeat; + color: #ffffff; + padding: 8px; + width: 5em; + font-weight: bold; +} +.bar { + display: none; +} +ul.navList, +ul.subNavList { + float: left; + margin: 0 25px 0 0; + padding: 0; +} +ul.navList li { + list-style: none; + float: left; + padding: 3px 6px; +} +ul.subNavList li { + list-style: none; + float: left; + font-size: 90%; +} +.topNav a:link, +.topNav a:active, +.topNav a:visited, +.bottomNav a:link, +.bottomNav a:active, +.bottomNav a:visited { + color: rgb(153, 153, 153); + text-decoration: none; +} +.topNav a:hover, +.bottomNav a:hover { + text-decoration: none; + color: #fff; +} +.navBarCell1Rev { + font-weight: bold; +} +.header, +.footer { + clear: both; + margin: 1em; +} +.indexHeader { + margin: 10px; + position: relative; +} +.indexHeader h1 { + font-size: 1.3em; +} +.subTitle { + display: none; +} +.header 
ul { + margin: 0 0 25px 0; + padding: 0; +} +.footer ul { + margin: 20px 0 5px 0; +} +.header ul li, +.footer ul li { + list-style: none; + font-size: 1.2em; +} +.header .docSummary ~ p { + display: none; +} +ul.blockList ul.blockList ul.blockList li.blockList h3 { + background-color: rgb(64, 64, 64); + color: #fff; + font-size: 1.2em; + padding: .5em 1em .5em 1em; +} +ul.blockList ul.blockList li.blockList h3 { + display: none; +} +ul.blockList li.blockList h2 { + padding: 0px 0 20px 0; +} +div.summary ul.blockList li.blockList ul.blockList li.blockList ul.blockList code { + display: block; + margin: 1em; +} +div.summary ul.blockList li.blockList ul.blockList li.blockList ul.blockList li.blockList h3 { + display: block; +} +.contentContainer, +.sourceContainer, +.classUseContainer, +.serializedFormContainer, +.constantValuesContainer { + clear: both; + position: relative; +} +.indexContainer { + margin: 10px; + position: relative; + font-size: 1.0em; +} +.indexContainer h2 { + font-size: 1.1em; + padding: 0 0 3px 0; +} +.indexContainer ul { + margin: 0; + padding: 0; +} +.indexContainer ul li { + list-style: none; +} +.contentContainer .description dl dt, +.contentContainer .details dl dt, +.serializedFormContainer dl dt { + font-size: 1.1em; + font-weight: bold; + margin: 10px 0 0 0; + color: #4E4E4E; +} +.contentContainer .description dl dd, +.contentContainer .details dl dd, +.serializedFormContainer dl dd { + margin: 10px 0 10px 20px; +} +.serializedFormContainer dl.nameValue dt { + margin-left: 1px; + font-size: 1.1em; + display: inline; + font-weight: bold; +} +.serializedFormContainer dl.nameValue dd { + margin: 0 0 0 1px; + font-size: 1.1em; + display: inline; +} +ul.horizontal li { + display: inline; + font-size: 0.9em; +} +ul.inheritance { + margin-left: 1em; + padding: 0; +} +ul.inheritance li { + display: inline; + list-style: none; +} +ul.inheritance li ul.inheritance { + margin-left: 15px; + padding-left: 15px; + padding-top: 1px; +} +ul.blockList, 
+ul.blockListLast { + margin: 1em 0 1em 0; + padding: 0; +} +ul.blockList h4, +ul.blockListLast h4 { + background-color: rgb(64, 64, 64); + color: #fff; + margin: 0 -.8333em 0 -.8333em; + padding: .5em 1em .5em 1em; +} +ul.blockList li.blockList, +ul.blockListLast li.blockList { + list-style: none; + margin-bottom: 25px; +} +ul.blockList ul.blockList ul.blockList ul.blockList li.blockList { + margin-left: 0; + padding-left: 0; + padding-bottom: 15px; + border: none; + border-bottom: 1px solid #9eadc0; +} +ul.blockList ul.blockList ul.blockList ul.blockList li.blockListLast { + list-style: none; + border-bottom: none; + padding-bottom: 0; +} +table tr td dl, +table tr td dl dt, +table tr td dl dd { + margin-top: 0; + margin-bottom: 1px; +} +.contentContainer table, +.classUseContainer table, +.constantValuesContainer table { + width: 100%; +} +.contentContainer ul li table, +.classUseContainer ul li table, +.constantValuesContainer ul li table { + width: 100%; +} +.contentContainer .description table, +.contentContainer .details table { + border-bottom: none; +} +.contentContainer ul li table th.colOne, +.contentContainer ul li table th.colFirst, +.contentContainer ul li table th.colLast, +.classUseContainer ul li table th, +.constantValuesContainer ul li table th, +.contentContainer ul li table td.colOne, +.contentContainer ul li table td.colFirst, +.contentContainer ul li table td.colLast, +.classUseContainer ul li table td, +.constantValuesContainer ul li table td { + vertical-align: top; +} +a[name="package_description"] ~ * { + display: none; +} +.overviewSummary caption, +.packageSummary caption, +.contentContainer ul.blockList li.blockList caption, +.summary caption, +.classUseContainer caption, +.constantValuesContainer caption { + position: relative; + text-align: left; + background-repeat: no-repeat; + color: #FFFFFF; + font-weight: bold; + clear: none; + overflow: hidden; + padding: 0px; + margin: 0px; +} +caption a:link, +caption a:hover, +caption 
a:active, +caption a:visited { + color: #FFFFFF; +} +.overviewSummary caption span, +.packageSummary caption span, +.contentContainer ul.blockList li.blockList caption span, +.summary caption span, +.classUseContainer caption span, +.constantValuesContainer caption span { + white-space: nowrap; + padding-top: 8px; + padding-left: 8px; + display: block; + float: left; + background-image: url(resources/titlebar.gif); + height: 18px; +} +.overviewSummary .tabEnd, +.packageSummary .tabEnd, +.contentContainer ul.blockList li.blockList .tabEnd, +.summary .tabEnd, +.classUseContainer .tabEnd, +.constantValuesContainer .tabEnd { + width: 10px; + background-image: url(resources/titlebar_end.gif); + background-repeat: no-repeat; + background-position: top right; + position: relative; + float: left; +} +ul.blockList ul.blockList li.blockList table { + width: 100%; +} +.tableSubHeadingColor { + background-color: #EEEEFF; +} +.rowColor { + background-color: #ffffff; +} +.overviewSummary td, +.packageSummary td, +.contentContainer ul.blockList li.blockList td, +.summary td, +.classUseContainer td, +.constantValuesContainer td { + text-align: left; + padding: .5em 1em .5em 1em; +} +th.colFirst, +th.colLast, +th.colOne, +.constantValuesContainer th { + background: rgb(64, 64, 64); + color: #fff; + text-align: left; + padding: .5em 1em .5em 1em; +} +td.colOne a:link, +td.colOne a:active, +td.colOne a:visited, +td.colOne a:hover, +td.colFirst a:link, +td.colFirst a:active, +td.colFirst a:visited, +td.colFirst a:hover, +td.colLast a:link, +td.colLast a:active, +td.colLast a:visited, +td.colLast a:hover, +.constantValuesContainer td a:link, +.constantValuesContainer td a:active, +.constantValuesContainer td a:visited, +.constantValuesContainer td a:hover { + font-weight: bold; +} +td.colFirst, +th.colFirst { + white-space: nowrap; +} +table.overviewSummary { + padding: 0px; + margin-left: 0px; +} +table.overviewSummary td.colFirst, +table.overviewSummary th.colFirst, 
+table.overviewSummary td.colOne, +table.overviewSummary th.colOne { + width: 25%; + vertical-align: middle; +} +table.packageSummary td.colFirst, +table.overviewSummary th.colFirst { + width: 25%; + vertical-align: middle; +} +div.description { + margin: 1em; +} +.description pre { + margin-top: 0; +} +.description .block { + margin: 2em 0 2em 0; +} +.deprecatedContent { + margin: 0; + padding: 10px 0; +} +.docSummary { + padding: 0; +} +.sourceLineNo { + color: green; + padding: 0 30px 0 0; +} +h1.hidden { + visibility: hidden; + overflow: hidden; + font-size: .9em; +} +.block { + display: block; + margin: 3px 0 0 0; +} +.strong { + font-weight: bold; +} diff --git a/src/main/java/com/mariten/kanatools/KanaAppraiser.java b/src/main/java/com/mariten/kanatools/KanaAppraiser.java index 8c251535..1f459e7e 100644 --- a/src/main/java/com/mariten/kanatools/KanaAppraiser.java +++ b/src/main/java/com/mariten/kanatools/KanaAppraiser.java @@ -2,69 +2,88 @@ /** * Confirms whether a given character is amongst certain types of Japanese text or not. + * + *

Supports checking whether a character is amongst the following types of text:

+ * + *

All input characters are expected to be ASCII or UTF-8 encoded

+ * + * @author Jeff Case (mariten) */ public class KanaAppraiser { - // Character set lower/upper bound definitions + //{{{ Character set lower/upper bound definitions //// Bounds for Hiragana - public static final char ZENKAKU_HIRAGANA_FIRST = 'ぁ'; // U+3041 - public static final char ZENKAKU_HIRAGANA_LAST_FOR_CONVERT = 'ん'; // U+3093 - public static final char ZENKAKU_HIRAGANA_LAST = 'ゖ'; // U+3096 + /** U+3041 */ public static final char ZENKAKU_HIRAGANA_FIRST = 'ぁ'; + /** U+3093 */ public static final char ZENKAKU_HIRAGANA_LAST_FOR_CONVERT = 'ん'; + /** U+3096 */ public static final char ZENKAKU_HIRAGANA_LAST = 'ゖ'; //// Bounds for Katakana - public static final char HANKAKU_KATAKANA_FIRST = 'ヲ'; // U+FF66 - public static final char HANKAKU_KATAKANA_LAST = 'ン'; // U+FF9D + /** U+FF66 */ public static final char HANKAKU_KATAKANA_FIRST = 'ヲ'; + /** U+FF9D */ public static final char HANKAKU_KATAKANA_LAST = 'ン'; - public static final char ZENKAKU_KATAKANA_FIRST = 'ァ'; // U+30A1 - public static final char ZENKAKU_KATAKANA_LAST_FOR_CONVERT = 'ン'; // U+30F3 - public static final char ZENKAKU_KATAKANA_LAST = 'ヺ'; // U+30FA + /** U+30A1 */ public static final char ZENKAKU_KATAKANA_FIRST = 'ァ'; + /** U+30F3 */ public static final char ZENKAKU_KATAKANA_LAST_FOR_CONVERT = 'ン'; + /** U+30FA */ public static final char ZENKAKU_KATAKANA_LAST = 'ヺ'; //// Bounds for Punctuation (kutoten) - public static final char HANKAKU_PUNCTUATION_FIRST = '。'; // U+FF61 - public static final char HANKAKU_PUNCTUATION_LAST = '゚'; // U+FF9F - public static final char HANKAKU_PUNCTUATION_ONBIKI = 'ー'; // U+FF70 + /** U+FF61 */ public static final char HANKAKU_PUNCTUATION_FIRST = '。'; + /** U+FF9F */ public static final char HANKAKU_PUNCTUATION_LAST = '゚'; + /** U+FF70 */ public static final char HANKAKU_PUNCTUATION_ONBIKI = 'ー'; - public static final char ZENKAKU_PUNCTUATION_FIRST = '、'; // U+3001 - public static final char ZENKAKU_PUNCTUATION_LAST = '〜'; // U+301C - public static final char 
ZENKAKU_PUNCTUATION_HG_FIRST = '゛'; // U+309B - public static final char ZENKAKU_PUNCTUATION_HG_LAST = 'ゞ'; // U+309E - public static final char ZENKAKU_PUNCTUATION_KK_FIRST = '・'; // U+30FB - public static final char ZENKAKU_PUNCTUATION_KK_LAST = 'ヾ'; // U+30FE + /** U+3001 */ public static final char ZENKAKU_PUNCTUATION_FIRST = '、'; + /** U+301C */ public static final char ZENKAKU_PUNCTUATION_LAST = '〜'; + /** U+309B */ public static final char ZENKAKU_PUNCTUATION_HG_FIRST = '゛'; + /** U+309E */ public static final char ZENKAKU_PUNCTUATION_HG_LAST = 'ゞ'; + /** U+30FB */ public static final char ZENKAKU_PUNCTUATION_KK_FIRST = '・'; + /** U+30FE */ public static final char ZENKAKU_PUNCTUATION_KK_LAST = 'ヾ'; //// Bounds for Numeric - public static final char HANKAKU_NUMBER_FIRST = '0'; // U+0030 - public static final char HANKAKU_NUMBER_LAST = '9'; // U+0039 + /** U+0030 */ public static final char HANKAKU_NUMBER_FIRST = '0'; + /** U+0039 */ public static final char HANKAKU_NUMBER_LAST = '9'; - public static final char ZENKAKU_NUMBER_FIRST = '0'; // U+FF10 - public static final char ZENKAKU_NUMBER_LAST = '9'; // U+FF19 + /** U+FF10 */ public static final char ZENKAKU_NUMBER_FIRST = '0'; + /** U+FF19 */ public static final char ZENKAKU_NUMBER_LAST = '9'; //// Bounds for Alphabetic - public static final char HANKAKU_LETTER_UPPER_FIRST = 'A'; // U+0041 - public static final char HANKAKU_LETTER_UPPER_LAST = 'Z'; // U+005A - public static final char HANKAKU_LETTER_LOWER_FIRST = 'a'; // U+0061 - public static final char HANKAKU_LETTER_LOWER_LAST = 'z'; // U+007A + /** U+0041 */ public static final char HANKAKU_LETTER_UPPER_FIRST = 'A'; + /** U+005A */ public static final char HANKAKU_LETTER_UPPER_LAST = 'Z'; + /** U+0061 */ public static final char HANKAKU_LETTER_LOWER_FIRST = 'a'; + /** U+007A */ public static final char HANKAKU_LETTER_LOWER_LAST = 'z'; - public static final char ZENKAKU_LETTER_UPPER_FIRST = 'A'; // U+FF21 - public static final char 
ZENKAKU_LETTER_UPPER_LAST = 'Z'; // U+FF3A - public static final char ZENKAKU_LETTER_LOWER_FIRST = 'a'; // U+FF41 - public static final char ZENKAKU_LETTER_LOWER_LAST = 'z'; // U+FF5A + /** U+FF21 */ public static final char ZENKAKU_LETTER_UPPER_FIRST = 'A'; + /** U+FF3A */ public static final char ZENKAKU_LETTER_UPPER_LAST = 'Z'; + /** U+FF41 */ public static final char ZENKAKU_LETTER_LOWER_FIRST = 'a'; + /** U+FF5A */ public static final char ZENKAKU_LETTER_LOWER_LAST = 'z'; // Bounds for All Alphanumeric and Symbol ASCII - public static final char HANKAKU_SPACE = ' '; // U+0020 - public static final char HANKAKU_ASCII_FIRST = '!'; // U+0021 - public static final char HANKAKU_ASCII_LAST = '~'; // U+007E + /** U+0020 */ public static final char HANKAKU_SPACE = ' '; + /** U+0021 */ public static final char HANKAKU_ASCII_FIRST = '!'; + /** U+007E */ public static final char HANKAKU_ASCII_LAST = '~'; - public static final char ZENKAKU_SPACE = ' '; // U+3000 - public static final char ZENKAKU_ASCII_FIRST = '!'; // U+FF01 - public static final char ZENKAKU_ASCII_LAST = '~'; // U+FF5E + /** U+3000 */ public static final char ZENKAKU_SPACE = ' '; + /** U+FF01 */ public static final char ZENKAKU_ASCII_FIRST = '!'; + /** U+FF5E */ public static final char ZENKAKU_ASCII_LAST = '~'; + //}}} //{{{ boolean isZenkakuHiragana(char) + /** + * Checks whether a character is full-width (zenkaku) hiragana. + * @since 1.2.0 + * @param eval_char UTF-8 character to evaluate + * @return True or false + */ public static boolean isZenkakuHiragana(char eval_char) { if(eval_char >= ZENKAKU_HIRAGANA_FIRST @@ -77,6 +96,12 @@ public static boolean isZenkakuHiragana(char eval_char) //{{{ boolean isZenkakuHiraganaWithKatakanaEquivalent(char) + /** + * Checks whether a character is full-width (zenkaku) hiragana and that it has a canonical one-to-one katakana equivalent. 
+ * @since 1.3.0 + * @param eval_char UTF-8 character to evaluate + * @return True or false + */ public static boolean isZenkakuHiraganaWithKatakanaEquivalent(char eval_char) { if(eval_char >= ZENKAKU_HIRAGANA_FIRST @@ -89,6 +114,12 @@ public static boolean isZenkakuHiraganaWithKatakanaEquivalent(char eval_char) //{{{ boolean isHankakuKatakana(char) + /** + * Checks whether a character is half-width (hankaku) katakana. + * @since 1.2.0 + * @param eval_char UTF-8 character to evaluate + * @return True or false + */ public static boolean isHankakuKatakana(char eval_char) { if(eval_char >= HANKAKU_KATAKANA_FIRST @@ -102,6 +133,12 @@ public static boolean isHankakuKatakana(char eval_char) //{{{ boolean isZenkakuKatakana(char) + /** + * Checks whether a character is full-width (zenkaku) katakana. + * @since 1.2.0 + * @param eval_char UTF-8 character to evaluate + * @return True or false + */ public static boolean isZenkakuKatakana(char eval_char) { if(eval_char >= ZENKAKU_KATAKANA_FIRST @@ -114,6 +151,12 @@ public static boolean isZenkakuKatakana(char eval_char) //{{{ boolean isZenkakuKatakanaWithHiraganaEquivalent(char) + /** + * Checks whether a character is full-width (zenkaku) katakana and that it has a canonical one-to-one hiragana equivalent. + * @since 1.3.0 + * @param eval_char UTF-8 character to evaluate + * @return True or false + */ public static boolean isZenkakuKatakanaWithHiraganaEquivalent(char eval_char) { if(eval_char >= ZENKAKU_KATAKANA_FIRST @@ -126,6 +169,12 @@ public static boolean isZenkakuKatakanaWithHiraganaEquivalent(char eval_char) //{{{ boolean isHankakuKutoten(char) + /** + * Checks whether a character is half-width (hankaku) Japanese punctuation (kutoten). 
+ * @since 1.3.0 + * @param eval_char UTF-8 character to evaluate + * @return True or false + */ public static boolean isHankakuKutoten(char eval_char) { if(eval_char >= HANKAKU_PUNCTUATION_FIRST @@ -139,6 +188,12 @@ public static boolean isHankakuKutoten(char eval_char) //{{{ boolean isZenkakuKutoten(char) + /** + * Checks whether a character is full-width (zenkaku) Japanese punctuation (kutoten). + * @since 1.3.0 + * @param eval_char UTF-8 character to evaluate + * @return True or false + */ public static boolean isZenkakuKutoten(char eval_char) { if((eval_char >= ZENKAKU_PUNCTUATION_FIRST && eval_char <= ZENKAKU_PUNCTUATION_LAST) @@ -152,6 +207,12 @@ public static boolean isZenkakuKutoten(char eval_char) //{{{ boolean isHankakuNumber(char) + /** + * Checks whether a character is a standard-width (hankaku) number. + * @since 1.2.0 + * @param eval_char UTF-8 character to evaluate + * @return True or false + */ public static boolean isHankakuNumber(char eval_char) { if(eval_char >= HANKAKU_NUMBER_FIRST @@ -164,6 +225,12 @@ public static boolean isHankakuNumber(char eval_char) //{{{ boolean isZenkakuNumber(char) + /** + * Checks whether a character is a double-width (zenkaku) number. + * @since 1.2.0 + * @param eval_char UTF-8 character to evaluate + * @return True or false + */ public static boolean isZenkakuNumber(char eval_char) { if(eval_char >= ZENKAKU_NUMBER_FIRST @@ -176,6 +243,12 @@ public static boolean isZenkakuNumber(char eval_char) //{{{ boolean isHankakuLetter(char) + /** + * Checks whether a character is a standard-width (hankaku) alphabetic letter. + * @since 1.2.0 + * @param eval_char UTF-8 character to evaluate + * @return True or false + */ public static boolean isHankakuLetter(char eval_char) { if(eval_char >= HANKAKU_LETTER_UPPER_FIRST @@ -192,6 +265,12 @@ public static boolean isHankakuLetter(char eval_char) //{{{ boolean isZenkakuLetter(char) + /** + * Checks whether a character is a double-width (zenkaku) alphabetic letter. 
+ * @since 1.2.0 + * @param eval_char UTF-8 character to evaluate + * @return True or false + */ public static boolean isZenkakuLetter(char eval_char) { if(eval_char >= ZENKAKU_LETTER_UPPER_FIRST @@ -208,6 +287,12 @@ public static boolean isZenkakuLetter(char eval_char) //{{{ boolean isHankakuAscii(char) + /** + * Checks whether a character is a standard-width (hankaku) ASCII. + * @since 1.2.0 + * @param eval_char UTF-8 character to evaluate + * @return True or false + */ public static boolean isHankakuAscii(char eval_char) { if(eval_char >= HANKAKU_ASCII_FIRST @@ -220,6 +305,12 @@ public static boolean isHankakuAscii(char eval_char) //{{{ boolean isZenkakuAscii(char) + /** + * Checks whether a character is a double-width (zenkaku) ASCII. + * @since 1.2.0 + * @param eval_char UTF-8 character to evaluate + * @return True or false + */ public static boolean isZenkakuAscii(char eval_char) { if(eval_char >= ZENKAKU_ASCII_FIRST diff --git a/src/main/java/com/mariten/kanatools/KanaConverter.java b/src/main/java/com/mariten/kanatools/KanaConverter.java index 82e19a0a..8e55f7aa 100644 --- a/src/main/java/com/mariten/kanatools/KanaConverter.java +++ b/src/main/java/com/mariten/kanatools/KanaConverter.java @@ -2,38 +2,127 @@ import com.mariten.kanatools.KanaAppraiser; import java.util.HashMap; +import java.util.HashSet; import java.util.Map; +import java.util.Set; /** - * Provides easy, automatic string conversions often necessary when dealing with Japanese text + * Easy back-and-forth conversion of kana, hankaku, zenkaku, and other characters used in Japanese text. * - * Port of PHP's "mb_convert_kana" function for Java. - * http://www.php.net/manual/en/function.mb-convert-kana.php + *

For example code, see the class details page

+ * + *

Perform multiple conversions on Kana and Roma-ji text with just a single static function call

+ * + * + * @author Jeff Case (mariten) */ public class KanaConverter { // Conversion Operations Types //// Matched numeric values to originals in PHP's source code //// https://github.com/php/php-src/blob/a84e5dc37dc0ff8c313164d9db141d3d9f2b2730/ext/mbstring/mbstring.c#L3434 + + /** + * Conversion Op Flag: Standard-width (hankaku) ASCII to double-width (zenkaku). + * See Conversion Op Guide for full details. + */ public static final int OP_HAN_ASCII_TO_ZEN_ASCII = 0x00000001; + + /** + * Conversion Op Flag: Standard-width (hankaku) alphabetic letters to double-width (zenkaku). + * See Conversion Op Guide for full details. + */ public static final int OP_HAN_LETTER_TO_ZEN_LETTER = 0x00000002; + + /** + * Conversion Op Flag: Standard-width (hankaku) numbers to double-width (zenkaku). + * See Conversion Op Guide for full details. + */ public static final int OP_HAN_NUMBER_TO_ZEN_NUMBER = 0x00000004; + + /** + * Conversion Op Flag: Standard-width (hankaku) spaces to double-width (zenkaku). + * See Conversion Op Guide for full details. + */ public static final int OP_HAN_SPACE_TO_ZEN_SPACE = 0x00000008; + + /** + * Conversion Op Flag: Half-width (hankaku) katakana to full-width (zenkaku). + * See Conversion Op Guide for full details. + */ public static final int OP_HAN_KATA_TO_ZEN_KATA = 0x00000100; + + /** + * Conversion Op Flag: Half-width (hankaku) katakana to full-width (zenkaku) hiragana. + * See Conversion Op Guide for full details. + */ public static final int OP_HAN_KATA_TO_ZEN_HIRA = 0x00000200; + + /** + * Conversion Op Flag: Keep hankaku katakana diacritic marks separate. + * See Conversion Op Guide for full details. + */ public static final int OP_KEEP_DIACRITIC_MARKS_APART = 0x00100000; + + /** + * Conversion Op Flag: Double-width (zenkaku) ASCII characters to standard-width (hankaku). + * See Conversion Op Guide for full details. 
+ */ public static final int OP_ZEN_ASCII_TO_HAN_ASCII = 0x00000010; + + /** + * Conversion Op Flag: Double-width (zenkaku) alphabetic letters to standard-width (hankaku). + * See Conversion Op Guide for full details. + */ public static final int OP_ZEN_LETTER_TO_HAN_LETTER = 0x00000020; + + /** + * Conversion Op Flag: Double-width (zenkaku) numbers to standard-width (hankaku). + * See Conversion Op Guide for full details. + */ public static final int OP_ZEN_NUMBER_TO_HAN_NUMBER = 0x00000040; + + /** + * Conversion Op Flag: Double-width (zenkaku) spaces to standard-width (hankaku). + * See Conversion Op Guide for full details. + */ public static final int OP_ZEN_SPACE_TO_HAN_SPACE = 0x00000080; + + /** + * Conversion Op Flag: Full-width (zenkaku) katakana to half-width (hankaku). + * See Conversion Op Guide for full details. + */ public static final int OP_ZEN_KATA_TO_HAN_KATA = 0x00001000; + + /** + * Conversion Op Flag: Full-width (zenkaku) hiragana to half-width (hankaku) katakana. + * See Conversion Op Guide for full details. + */ public static final int OP_ZEN_HIRA_TO_HAN_KATA = 0x00002000; + + /** + * Conversion Op Flag: Full-width (zenkaku) hiragana to full-width (zenkaku) katakana. + * See Conversion Op Guide for full details. + */ public static final int OP_ZEN_HIRA_TO_ZEN_KATA = 0x00010000; + + /** + * Conversion Op Flag: Full-width (zenkaku) katakana to full-width (zenkaku) hiragana. + * See Conversion Op Guide for full details. 
+ */ public static final int OP_ZEN_KATA_TO_ZEN_HIRA = 0x00020000; + //// Maintain backwards compatibility (based on mb_convert_kana's "$option" parameter from PHP) //// Details: http://php.net/manual/en/function.mb-convert-kana.php - public static final Map LETTER_OP_CODE_LOOKUP; + protected static final Map LETTER_OP_CODE_LOOKUP; static { LETTER_OP_CODE_LOOKUP = new HashMap(); LETTER_OP_CODE_LOOKUP.put('A', OP_HAN_ASCII_TO_ZEN_ASCII); @@ -55,13 +144,16 @@ public class KanaConverter //{{{ String convertKana(String, int, String) /** - * Converts a string containing kana or other characters used in Japanese text input - * according to one or more requested conversion methods. + * Converts a string containing kana or other characters used in Japanese text input according to + * one or more requested conversion methods, and permits exclusion of certain characters. * - * @param original_string Input string to perform conversion on - * @param conversion_ops Flag-based integer indicating which type of conversions to perform - * @param chars_to_ignore Each character in this string will be excluded from conversion - * @return Content of "original_string" with specified conversions performed + * @since 1.1.0 + * @param original_string UTF-8 string to convert + * @param conversion_ops Flag-based integer indicating which type of conversion operations to perform + * by setting one or more "KanaConverter.OP_" constants; details on this parameter + * can be found here + * @param chars_to_ignore No characters in this string will be converted even if present in "original_string" + * @return UTF-8 string with all requested conversions applied */ public static String convertKana(String original_string, int conversion_ops, String chars_to_ignore) { @@ -82,6 +174,10 @@ public static String convertKana(String original_string, int conversion_ops, Str do_collapse_on_hankaku_diacritic = false; } + // Prepare excluded characters lookup + Set ignore_char_lookup = 
makeIgnoreCharLookup(chars_to_ignore); + + // Init data holders int char_count = original_string.length(); StringBuffer new_string = new StringBuffer(); int i = 0; @@ -96,8 +192,7 @@ public static String convertKana(String original_string, int conversion_ops, Str } // Skip all conversions if character is on the excluded chars list - boolean is_ignore_char = isIgnoreChar(current_char, chars_to_ignore); - if(is_ignore_char) { + if(ignore_char_lookup.contains(current_char)) { new_string.append(current_char); i++; continue; @@ -227,12 +322,15 @@ public static String convertKana(String original_string, int conversion_ops, Str //}}} //{{{ String convertKana(String, int) /** - * Converts a string containing kana or other characters used in Japanese text input - * according to one or more requested conversion methods. + * Converts a string containing kana or other characters used in Japanese text input according to + * one or more requested conversion methods. * - * @param original_string Input string to perform conversion on - * @param conversion_ops Flag-based integer indicating which type of conversions to perform - * @return Content of "original_string" with specified conversions performed + * @since 1.0.0 + * @param original_string UTF-8 string to convert + * @param conversion_ops Flag-based integer indicating which type of conversion operations to perform + * by setting one or more "KanaConverter.OP_" constants; details on this parameter + * can be found here + * @return UTF-8 string with all requested conversions applied */ public static String convertKana(String original_string, int conversion_ops) { @@ -241,13 +339,15 @@ public static String convertKana(String original_string, int conversion_ops) //}}} //{{{ String convertKana(String, String, String) /** - * Converts a string containing kana or other characters used in Japanese text input - * according to one or more requested conversion methods. 
+ * Converts a string containing kana or other characters used in Japanese text input according to + * one or more requested conversion methods, and permits exclusion of certain characters. * - * @param original_string Input string to perform conversion on + * @deprecated + * @see #convertKana(String,int,String) + * @param original_string UTF-8 string to convert * @param conversion_ops_string PHP mb_convert_kana style string specifying desired conversions - * @param chars_to_ignore Each character in this string will be excluded from conversion - * @return Content of "original_string" with specified conversions performed + * @param chars_to_ignore No characters in this string will be converted even if present in "original_string" + * @return UTF-8 string with all requested conversions applied */ public static String convertKana(String original_string, String conversion_ops_string, String chars_to_ignore) { @@ -257,12 +357,14 @@ public static String convertKana(String original_string, String conversion_ops_s //}}} //{{{ String convertKana(String, String) /** - * Converts a string containing kana or other characters used in Japanese text input - * according to one or more requested conversion methods. + * Converts a string containing kana or other characters used in Japanese text input according to + * one or more requested conversion methods. 
* - * @param original_string Input string to perform conversion on + * @deprecated + * @see #convertKana(String,int) + * @param original_string UTF-8 string to convert * @param conversion_ops_string PHP mb_convert_kana style string specifying desired conversions - * @return Content of "original_string" with specified conversions performed + * @return UTF-8 string with all requested conversions applied */ public static String convertKana(String original_string, String conversion_ops_string) { @@ -273,8 +375,8 @@ public static String convertKana(String original_string, String conversion_ops_s //{{{ Hankaku Katakana related mappings // Diacritic constants - public static final char HANKAKU_VOICED_MARK = '゙'; // dakuten - public static final char HANKAKU_ASPIRATED_MARK = '゚'; // handakuten + protected static final char HANKAKU_VOICED_MARK = '゙'; // dakuten + protected static final char HANKAKU_ASPIRATED_MARK = '゚'; // handakuten protected static final Map MAPPING_HANKAKU_TO_ZENKAKU_KATAKANA_UNVOICED; static { @@ -510,11 +612,11 @@ public static String convertKana(String original_string, String conversion_ops_s // Connect mapping of hiragana and katakana char codes - public static final int OFFSET_ZENKAKU_HIRAGANA_TO_ZENKAKU_KATAKANA = + protected static final int OFFSET_ZENKAKU_HIRAGANA_TO_ZENKAKU_KATAKANA = (KanaAppraiser.ZENKAKU_KATAKANA_FIRST - KanaAppraiser.ZENKAKU_HIRAGANA_FIRST); // Connect mapping of regular ASCII characters to Zenkaku ASCII characters - public static final int OFFSET_HANKAKU_ASCII_TO_ZENKAKU_ASCII = + protected static final int OFFSET_HANKAKU_ASCII_TO_ZENKAKU_ASCII = (KanaAppraiser.ZENKAKU_ASCII_FIRST - KanaAppraiser.HANKAKU_ASCII_FIRST); @@ -704,19 +806,15 @@ protected static char convertZenkakuSpaceToHankakuSpace(char target) //}}} - //{{{ boolean isIgnoreChar(char, String) - protected static boolean isIgnoreChar(char char_to_check, String chars_to_ignore) + //{{{ boolean makeIgnoreCharLookup(String) + protected static Set 
makeIgnoreCharLookup(String chars_to_ignore) { + Set lookup_hash = new HashSet(); int ignore_char_count = chars_to_ignore.length(); for(int i = 0; i < ignore_char_count; i++) { - if(char_to_check == chars_to_ignore.charAt(i)) { - // Matched - return true; - } + lookup_hash.add(chars_to_ignore.charAt(i)); } - - // No matches - return false; + return lookup_hash; } //}}}