diff --git a/README.md b/README.md index d3cfcb5..2ee3e5a 100644 --- a/README.md +++ b/README.md @@ -4,8 +4,7 @@ BoGo [![Build Status](https://travis-ci.org/BoGoEngine/bogo-python.svg?branch=master)](https://travis-ci.org/BoGoEngine/bogo-python) [![Coverage Status](https://coveralls.io/repos/BoGoEngine/bogo-python/badge.png?branch=master)](https://coveralls.io/r/BoGoEngine/bogo-python?branch=master) -BoGo is a Vietnamese input method conversion library for Python. This library -is intentionally functional with no internal state and side-effect. +BoGo is a Vietnamese input method conversion library for Python. Installation ------------ diff --git a/bogo/__init__.py b/bogo/__init__.py index d507d7c..814a955 100644 --- a/bogo/__init__.py +++ b/bogo/__init__.py @@ -25,7 +25,6 @@ """ from bogo.core import \ - process_key, \ process_sequence, \ get_telex_definition, \ get_vni_definition diff --git a/bogo/accent.py b/bogo/accent.py index 832f0ba..f703e5b 100644 --- a/bogo/accent.py +++ b/bogo/accent.py @@ -3,7 +3,7 @@ # This file is part of ibus-bogo project. # # Copyright (C) 2012 Long T. Dam -# Copyright (C) 2012-2013 Trung Ngo +# Copyright (C) 2012-2014 Trung Ngo # Copyright (C) 2013 Duong H. Nguyen # # ibus-bogo is free software: you can redistribute it and/or modify @@ -21,7 +21,7 @@ # """ -Utility functions to deal with accents (should have been called tones), +Utility functions to deal with accents (also called tones), which are diacritical markings that changes the pitch of a character. E.g. the acute accent in á. """ @@ -31,9 +31,11 @@ from __future__ import unicode_literals from bogo import utils +from bogo.syllable import Syllable class Accent: + MAX_VALUE = 6 GRAVE = 5 ACUTE = 4 HOOK = 3 @@ -62,61 +64,65 @@ def get_accent_string(string): return accents[-1] if accents else Accent.NONE -def add_accent(components, accent): +def add_accent(syllable, accent): """ - Add accent to the given components. 
The parameter components is - the result of function separate() + Add accent to the given syllable. """ - vowel = components[1] - last_consonant = components[2] + vowel = syllable.vowel + + if not vowel: + return syllable + if accent == Accent.NONE: vowel = remove_accent_string(vowel) - return [components[0], vowel, last_consonant] + return Syllable(syllable.initial_consonant, vowel, syllable.final_consonant) - if vowel == "": - return components - #raw_string is a list, not a str object - raw_string = remove_accent_string(vowel).lower() - new_vowel = "" + vowel_wo_accent = remove_accent_string(vowel).lower() + new_vowel = '' + # Highest priority for ê and ơ - index = max(raw_string.find("ê"), raw_string.find("ơ")) - if index != -1: - new_vowel = vowel[:index] + add_accent_char(vowel[index], accent) + vowel[index+1:] - elif len(vowel) == 1 or (len(vowel) == 2 and last_consonant == ""): - new_vowel = add_accent_char(vowel[0], accent) + vowel[1:] + index = max(vowel_wo_accent.find("ê"), vowel_wo_accent.find("ơ")) + found_e_hat_or_o_horn = index != -1 + + if found_e_hat_or_o_horn: + # Add accent mark to the found ê or ơ + new_vowel = \ + vowel[:index] + \ + add_accent_char(vowel[index], accent) + \ + vowel[index + 1:] + elif len(vowel) == 1 or (len(vowel) == 2 and not syllable.final_consonant): + # cá + # cháo + first_vowel_char = vowel[0] + first_vowel_char_with_accent = add_accent_char(first_vowel_char, accent) + new_vowel = first_vowel_char_with_accent + vowel[1:] else: - new_vowel = vowel[:1] + add_accent_char(vowel[1], accent) + vowel[2:] - return [components[0], new_vowel, components[2]] + # biến + # khuỷu + second_vowel_char = vowel[1] + second_vowel_char_with_accent = add_accent_char(second_vowel_char, accent) + new_vowel = vowel[:1] + second_vowel_char_with_accent + vowel[2:] + return Syllable(syllable.initial_consonant, new_vowel, syllable.final_consonant) + +@utils.keep_case def add_accent_char(char, accent): """ - Add accent to a single char. 
Parameter accent is member of class - Accent + Add accent to a single char. + + Args: + accent: an Accent enum value """ - if char == "": - return "" - case = char.isupper() - char = char.lower() + if not (char and accent in range(0, Accent.MAX_VALUE + 1)): + return char + index = utils.VOWELS.find(char) if (index != -1): index = index - index % 6 + 5 char = utils.VOWELS[index - accent] - return utils.change_case(char, case) - -def add_accent_at(string, accent, index): - """ - Add mark to the index-th character of the given string. Return - the new string after applying change. - (unused) - """ - if index == -1: - return string - # Python can handle the case which index is out of range of given string - return string[:index] + \ - accent.accent.add_accent_char(string[index], accent) + \ - string[index+1:] + return char def remove_accent_char(char): diff --git a/bogo/core.py b/bogo/core.py index 697e3ba..2a3f300 100644 --- a/bogo/core.py +++ b/bogo/core.py @@ -25,9 +25,8 @@ """ from __future__ import unicode_literals -from bogo.validation import is_valid_combination -from bogo import utils, accent, mark -import logging +from bogo.syllable import Syllable +from bogo import accent, mark, validation import sys import string @@ -55,11 +54,11 @@ def get_telex_definition(w_shorthand=True, brackets_shorthand=True): Returns a dictionary to be passed into process_key(). """ telex = { - "a": "a^", - "o": "o^", - "e": "e^", - "w": ["u*", "o*", "a+"], - "d": "d-", + "a": "^", + "o": "^", + "e": "^", + "w": ["*", "("], + "d": "-", "f": "\\", "s": "/", "r": "?", @@ -127,365 +126,253 @@ def process_sequence(sequence, i.e. process_sequence('con meof.ddieen') should work. 
""" result = "" - raw = result - result_parts = [] + result_chunks = [] if rules is None: rules = get_telex_definition() accepted_chars = _accepted_chars(rules) + rules = Rule(rules) + bg = BoGo(rules) for key in sequence: if key not in accepted_chars: - result_parts.append(result) - result_parts.append(key) + result_chunks.append(result) + result_chunks.append(key) result = "" - raw = "" + bg = BoGo(rules) else: - result, raw = process_key( - string=result, - key=key, - fallback_sequence=raw, - rules=rules, - skip_non_vietnamese=skip_non_vietnamese) + result = bg.add_key(key) + if not validation.is_valid_string(result): + result = bg.raw_string() - result_parts.append(result) - return ''.join(result_parts) + result_chunks.append(result) + return ''.join(result_chunks) -def process_key(string, key, - fallback_sequence="", rules=None, - skip_non_vietnamese=True): - """Process a keystroke. +class Transformation: - Args: - string: The previously processed string or "". - key: The keystroke. - fallback_sequence: The previous keystrokes. - rules (optional): A dictionary listing - transformation rules. Defaults to get_telex_definition(). - skip_non_vietnamese (optional): Whether to skip results that - doesn't seem like Vietnamese. Defaults to True. - - Returns a tuple. The first item of which is the processed - Vietnamese string, the second item is the next fallback sequence. - The two items are to be fed back into the next call of process_key() - as `string` and `fallback_sequence`. If `skip_non_vietnamese` is - True and the resulting string doesn't look like Vietnamese, - both items contain the `fallback_sequence`. - - >>> process_key('a', 'a', 'a') - (â, aa) - - Note that when a key is an undo key, it won't get appended to - `fallback_sequence`. - - >>> process_key('â', 'a', 'aa') - (aa, aa) - - `rules` is a dictionary that maps keystrokes to - their effect string. 
The effects can be one of the following: - - 'a^': a with circumflex (â), only affect an existing 'a family' - 'a+': a with breve (ă), only affect an existing 'a family' - 'e^': e with circumflex (ê), only affect an existing 'e family' - 'o^': o with circumflex (ô), only affect an existing 'o family' - 'o*': o with horn (ơ), only affect an existing 'o family' - 'd-': d with bar (đ), only affect an existing 'd' - '/': acute (sắc), affect an existing vowel - '\': grave (huyền), affect an existing vowel - '?': hook (hỏi), affect an existing vowel - '~': tilde (ngã), affect an existing vowel - '.': dot (nặng), affect an existing vowel - '<ư': append ư - '<ơ': append ơ - - A keystroke entry can have multiple effects, in which case the - dictionary entry's value should be a list of the possible - effect strings. Although you should try to avoid this if - you are defining a custom input method rule. - """ - # TODO Figure out a way to remove the `string` argument. Perhaps only the - # key sequence is needed? - def default_return(): - return string + key, fallback_sequence + key + def __init__(self, key): + self.key = key - if rules is None: - rules = get_telex_definition() + def perform(self, syllable): + raise NotImplementedError - comps = utils.separate(string) - - # if not _is_processable(comps): - # return default_return() - - # Find all possible transformations this keypress can generate - trans_list = _get_transformation_list( - key, rules, fallback_sequence) - - # Then apply them one by one - new_comps = list(comps) - for trans in trans_list: - new_comps = _transform(new_comps, trans) - - if new_comps == comps: - tmp = list(new_comps) - - # If none of the transformations (if any) work - # then this keystroke is probably an undo key. - if _can_undo(new_comps, trans_list): - # The prefix "_" means undo. 
- for trans in map(lambda x: "_" + x, trans_list): - new_comps = _transform(new_comps, trans) - - # Undoing the w key with the TELEX input method with the - # w:<ư extension requires some care. - # - # The input (ư, w) should be undone as w - # on the other hand, (ư, uw) should return uw. - # - # _transform() is not aware of the 2 ways to generate - # ư in TELEX and always think ư was created by uw. - # Therefore, after calling _transform() to undo ư, - # we always get ['', 'u', '']. - # - # So we have to clean it up a bit. - def is_telex_like(): - return '<ư' in rules["w"] - - def undone_vowel_ends_with_u(): - return new_comps[1] and new_comps[1][-1].lower() == "u" - - def not_first_key_press(): - return len(fallback_sequence) >= 1 - - def user_typed_ww(): - return (fallback_sequence[-1:]+key).lower() == "ww" - - def user_didnt_type_uww(): - return not (len(fallback_sequence) >= 2 and - fallback_sequence[-2].lower() == "u") - - if is_telex_like() and \ - not_first_key_press() and \ - undone_vowel_ends_with_u() and \ - user_typed_ww() and \ - user_didnt_type_uww(): - # The vowel part of new_comps is supposed to end with - # u now. That u should be removed. - new_comps[1] = new_comps[1][:-1] - - if tmp == new_comps: - fallback_sequence += key - new_comps = utils.append_comps(new_comps, key) - else: - fallback_sequence += key - if skip_non_vietnamese is True and key.isalpha() and \ - not is_valid_combination(new_comps, final_form=False): - result = fallback_sequence, fallback_sequence - else: - result = utils.join(new_comps), fallback_sequence +class AddCharTransformation(Transformation): + def __init__(self, key, char): + super(AddCharTransformation, self).__init__(key) + self.char = char - return result + def perform(self, syllable): + return syllable.append_char(self.char) -def _get_transformation_list(key, im, fallback_sequence): - """ - Return the list of transformations inferred from the entered key. 
The - map between transform types and keys is given by module - bogo_config (if exists) or by variable simple_telex_im +class ByPassTransformation(AddCharTransformation): + def __init__(self, key): + super(ByPassTransformation, self).__init__(key, key) - if entered key is not in im, return "+key", meaning appending - the entered key to current text - """ - # if key in im: - # lkey = key - # else: - # lkey = key.lower() - lkey = key.lower() - - if lkey in im: - if isinstance(im[lkey], list): - trans_list = im[lkey] - else: - trans_list = [im[lkey]] - - for i, trans in enumerate(trans_list): - if trans[0] == '<' and key.isalpha(): - trans_list[i] = trans[0] + \ - utils.change_case(trans[1], int(key.isupper())) - - if trans_list == ['_']: - if len(fallback_sequence) >= 2: - # TODO Use takewhile()/dropwhile() to process the last IM keypress - # instead of assuming it's the last key in fallback_sequence. - t = list(map(lambda x: "_" + x, - _get_transformation_list(fallback_sequence[-2], im, - fallback_sequence[:-1]))) - # print(t) - trans_list = t - # else: - # trans_list = ['+' + key] - - return trans_list - else: - return ['+' + key] +class AddToneMarkTransformation(Transformation): + def __init__(self, key, tone): + super(AddToneMarkTransformation, self).__init__(key) + self.tone = tone -def _get_action(trans): - """ - Return the action inferred from the transformation `trans`. 
- and the parameter going with this action - An _Action.ADD_MARK goes with a Mark - while an _Action.ADD_ACCENT goes with an Accent - """ - # TODO: VIQR-like convention - mark_action = { - '^': (_Action.ADD_MARK, Mark.HAT), - '+': (_Action.ADD_MARK, Mark.BREVE), - '*': (_Action.ADD_MARK, Mark.HORN), - '-': (_Action.ADD_MARK, Mark.BAR), - } + def perform(self, syllable): + return accent.add_accent(syllable, self.tone) - accent_action = { - '\\': (_Action.ADD_ACCENT, Accent.GRAVE), - '/': (_Action.ADD_ACCENT, Accent.ACUTE), - '?': (_Action.ADD_ACCENT, Accent.HOOK), - '~': (_Action.ADD_ACCENT, Accent.TIDLE), - '.': (_Action.ADD_ACCENT, Accent.DOT), - } - if trans[0] in ('<', '+'): - return _Action.ADD_CHAR, trans[1] - if trans[0] == "_": - return _Action.UNDO, trans[1:] - if len(trans) == 2: - return mark_action[trans[1]] - else: - return accent_action[trans[0]] +class AddCharMarkTransformation(Transformation): + def __init__(self, key, mark): + super(AddCharMarkTransformation, self).__init__(key) + self.mark = mark + def perform(self, syllable): + return mark.add_mark(syllable, self.mark) -def _transform(comps, trans): - """ - Transform the given string with transform type trans - """ - logging.debug("== In _transform(%s, %s) ==", comps, trans) - components = list(comps) - - action, parameter = _get_action(trans) - if action == _Action.ADD_MARK and \ - components[2] == "" and \ - mark.strip(components[1]).lower() in ['oe', 'oa'] and trans == "o^": - action, parameter = _Action.ADD_CHAR, trans[0] - - if action == _Action.ADD_ACCENT: - logging.debug("add_accent(%s, %s)", components, parameter) - components = accent.add_accent(components, parameter) - elif action == _Action.ADD_MARK and mark.is_valid_mark(components, trans): - logging.debug("add_mark(%s, %s)", components, parameter) - components = mark.add_mark(components, parameter) - - # Handle uơ in "huơ", "thuở", "quở" - # If the current word has no last consonant and the first consonant - # is one of "h", "th" and 
the vowel is "ươ" then change the vowel into - # "uơ", keeping case and accent. If an alphabet character is then added - # into the word then change back to "ươ". - # - # NOTE: In the dictionary, these are the only words having this strange - # vowel so we don't need to worry about other cases. - if accent.remove_accent_string(components[1]).lower() == "ươ" and \ - not components[2] and components[0].lower() in ["", "h", "th", "kh"]: - # Backup accents - ac = accent.get_accent_string(components[1]) - components[1] = ("u", "U")[components[1][0].isupper()] + components[1][1] - components = accent.add_accent(components, ac) - - elif action == _Action.ADD_CHAR: - if trans[0] == "<": - if not components[2]: - # Only allow ư, ơ or ươ sitting alone in the middle part - # and ['g', 'i', '']. If we want to type giowf = 'giờ', separate() - # will create ['g', 'i', '']. Therefore we have to allow - # components[1] == 'i'. - if (components[0].lower(), components[1].lower()) == ('g', 'i'): - components[0] += components[1] - components[1] = '' - if not components[1] or \ - (components[1].lower(), trans[1].lower()) == ('ư', 'ơ'): - components[1] += trans[1] - else: - components = utils.append_comps(components, parameter) - if parameter.isalpha() and \ - accent.remove_accent_string(components[1]).lower().startswith("uơ"): - ac = accent.get_accent_string(components[1]) - components[1] = ('ư', 'Ư')[components[1][0].isupper()] + \ - ('ơ', 'Ơ')[components[1][1].isupper()] + components[1][2:] - components = accent.add_accent(components, ac) - elif action == _Action.UNDO: - components = _reverse(components, trans[1:]) - - if action == _Action.ADD_MARK or (action == _Action.ADD_CHAR and parameter.isalpha()): - # If there is any accent, remove and reapply it - # because it is likely to be misplaced in previous transformations - ac = accent.get_accent_string(components[1]) - - if ac != accent.Accent.NONE: - components = accent.add_accent(components, Accent.NONE) - components = 
accent.add_accent(components, ac) - - logging.debug("After transform: %s", components) - return components - - -def _reverse(components, trans): - """ - Reverse the effect of transformation 'trans' on 'components' - If the transformation does not affect the components, return the original - string. - """ - action, parameter = _get_action(trans) - comps = list(components) - string = utils.join(comps) +class UndoAddToneMarkTransformation(Transformation): + def __init__(self, other_trans): + super(UndoAddToneMarkTransformation, self).__init__(other_trans.key) + self.tone = other_trans.tone - if action == _Action.ADD_CHAR and string[-1].lower() == parameter.lower(): - if comps[2]: - i = 2 - elif comps[1]: - i = 1 - else: - i = 0 - comps[i] = comps[i][:-1] - elif action == _Action.ADD_ACCENT: - comps = accent.add_accent(comps, Accent.NONE) - elif action == _Action.ADD_MARK: - if parameter == Mark.BAR: - comps[0] = comps[0][:-1] + \ - mark.add_mark_char(comps[0][-1:], Mark.NONE) + def perform(self, syllable): + # A proper syllable has exactly one tone. Undoing the + # last tone means removing all tones. 
+ return accent.add_accent(syllable, Accent.NONE) + + +class UndoAddCharMarkTransformation(Transformation): + def __init__(self, other_trans): + super(UndoAddCharMarkTransformation, self).__init__(other_trans.key) + self.mark = other_trans.mark + self.other_trans = other_trans + + def perform(self, syllable): + new_initial_consonant, new_vowel, new_final_consonant = syllable + + if self.mark == Mark.BAR: + new_initial_consonant = syllable.initial_consonant[:-1] + \ + mark.add_mark_char(syllable.initial_consonant[-1:], Mark.NONE) else: - if mark.is_valid_mark(comps, trans): - comps[1] = "".join([mark.add_mark_char(c, Mark.NONE) - for c in comps[1]]) - return comps + new_vowel = "".join([mark.add_mark_char(c, Mark.NONE) + for c in new_vowel]) + return Syllable(new_initial_consonant, new_vowel, new_final_consonant) -def _can_undo(comps, trans_list): - """ - Return whether a components can be undone with one of the transformation in - trans_list. - """ - comps = list(comps) - accent_list = list(map(accent.get_accent_char, comps[1])) - mark_list = list(map(mark.get_mark_char, utils.join(comps))) - action_list = list(map(lambda x: _get_action(x), trans_list)) - - def atomic_check(action): - """ - Check if the `action` created one of the marks, accents, or characters - in `comps`. 
- """ - return (action[0] == _Action.ADD_ACCENT and action[1] in accent_list) \ - or (action[0] == _Action.ADD_MARK and action[1] in mark_list) \ - or (action[0] == _Action.ADD_CHAR and action[1] == \ - accent.remove_accent_char(comps[1][-1])) # ơ, ư - - return any(map(atomic_check, action_list)) + +class UndoAddCharTransformation(Transformation): + def __init__(self, other_trans): + super(UndoAddCharTransformation, self).__init__(other_trans.key) + + def perform(self, syllable): + # Just remove the last char + return Syllable.new_from_string(syllable.as_string()[:-1]) + + +def undo_transformation_of(trans): + if type(trans) is AddToneMarkTransformation: + return UndoAddToneMarkTransformation(trans) + elif type(trans) is AddCharMarkTransformation: + return UndoAddCharMarkTransformation(trans) + elif type(trans) is AddCharTransformation: + return UndoAddCharTransformation(trans) + + +class Rule: + mark_action = { + '^': Mark.HAT, + '(': Mark.BREVE, + '*': Mark.HORN, + '-': Mark.BAR, + } + + accent_action = { + '\\': Accent.GRAVE, + '/': Accent.ACUTE, + '?': Accent.HOOK, + '~': Accent.TIDLE, + '.': Accent.DOT, + } + + def __init__(self, rule_dict): + self.rule_dict = rule_dict + + @staticmethod + def parse_rule_action(rule_action, key): + # Each typing rule consists of 2 parts: a predicate and + # an action associated with that predicate. + # e.g.: In the rule 'r -> ?', 'r' is the key, the predicate, + # and ? is the action (add a HOOK tone mark to the + # suitable vowel). + + if rule_action[0] == '<': + # <ư + print(rule_action) + trans = AddCharTransformation(key, rule_action[1]) + # elif rule_action[0] == "_": + # # _a^ + # trans = Transformation(_Action.UNDO, rule_action[1:]) + elif rule_action in Rule.mark_action: + # ^ + trans = AddCharMarkTransformation( + key, Rule.mark_action[rule_action]) + elif rule_action in Rule.accent_action: + # ? + trans = AddToneMarkTransformation( + key, Rule.accent_action[rule_action]) + else: + # TODO ? 
+ raise ValueError + + if type(trans) is AddCharTransformation: + if key.isupper(): + trans.key = trans.key.toupper() + + return trans + + def possible_transformations_from(self, key): + if key in self.rule_dict: + if type(self.rule_dict[key]) is list: + transformations = \ + [self.parse_rule_action(rule_action, key) + for rule_action in self.rule_dict[key]] + else: + transformations = \ + [self.parse_rule_action(self.rule_dict[key], key)] + + return transformations + [ByPassTransformation(key)] + else: + return [ByPassTransformation(key)] + + +class BoGo: + def __init__(self, typing_rule): + self.rule = typing_rule + self.transformations = [] + self.syllable = Syllable('', '', '') + self.undone_keys = [] + + def raw_string(self): + return "".join([trans.key for trans in self.transformations]) + + def result(self): + return self.syllable.as_string() + + def best_transformation(self, key): + if key in self.undone_keys: + return ByPassTransformation(key) + + transformations = self.rule.possible_transformations_from(key) + + # Check if the same key has been used before in a transformation. + # If it has then undo it. 
+ for trans in self.transformations: + if type(trans) is not ByPassTransformation and \ + trans.key == key: + self.undone_keys.append(key) + return undo_transformation_of(trans) + + for transformation in transformations: + if type(transformation) is AddToneMarkTransformation: + if self.syllable.vowel is not "": + return transformation + elif type(transformation) is AddCharMarkTransformation: + if transformation.mark == Mark.HAT: + if self.syllable.vowel is not "" and \ + self.syllable.vowel[-1] in \ + ('a', 'ă', 'â', 'o', 'ô', 'ơ', 'e', 'ê'): + return transformation + elif transformation.mark == Mark.BREVE: + if self.syllable.vowel is not "" and \ + mark.strip(self.syllable.vowel) is 'a': + return transformation + elif transformation.mark == Mark.HORN: + v = mark.strip(self.syllable.vowel) + if v is not "" and \ + ('u' in v or 'o' in v): + return transformation + elif transformation.mark == Mark.BAR: + ic = self.syllable.initial_consonant + if ic is not "" and \ + mark.strip(ic)[-1] is 'd': + return transformation + elif type(transformation) is AddCharTransformation: + return transformation + elif type(transformation) is ByPassTransformation: + return transformation + + def add_key(self, key): + transformation = self.best_transformation(key) + + # Recreate the syllable to fix the "gio" key sequence. + # After the 2 first keys, the syllable will have the form + # of ('g', 'i', ''). We want it to have the form + # ('gi', 'o', '') after the 3rd key. 
+ self.syllable = Syllable.new_from_string(self.result()) + self.syllable = transformation.perform(self.syllable) + + if transformation.__class__.__name__.startswith("Undo"): + self.syllable = self.syllable.append_char(key) + + self.transformations.append(transformation) + return self.result() diff --git a/bogo/mark.py b/bogo/mark.py index bdc63f7..88ef556 100644 --- a/bogo/mark.py +++ b/bogo/mark.py @@ -29,6 +29,7 @@ from __future__ import unicode_literals from bogo import accent, utils +from bogo.syllable import Syllable Accent = accent.Accent @@ -66,35 +67,44 @@ def get_mark_char(char): # TODO: Monstrous code. Needs refactoring. -def add_mark(components, mark): - comp = list(components) - if mark == Mark.BAR and comp[0] and comp[0][-1].lower() in FAMILY_D: - comp[0] = add_mark_at(comp[0], len(comp[0])-1, Mark.BAR) +def add_mark(syllable, mark): + new_initial_consonant, new_vowel, new_final_consonant = syllable + + if mark == Mark.BAR and \ + syllable.initial_consonant and \ + syllable.initial_consonant[-1].lower() in FAMILY_D: + new_initial_consonant = add_mark_at( + syllable.initial_consonant, + len(syllable.initial_consonant) - 1, + Mark.BAR) else: - #remove all marks and accents in vowel part - raw_vowel = accent.add_accent(comp, Accent.NONE)[1].lower() - raw_vowel = utils.join([add_mark_char(c, Mark.NONE) for c in raw_vowel]) + raw_vowel = strip(syllable.vowel).lower() if mark == Mark.HAT: pos = max(raw_vowel.find("a"), raw_vowel.find("o"), raw_vowel.find("e")) - comp[1] = add_mark_at(comp[1], pos, Mark.HAT) + new_vowel = add_mark_at(syllable.vowel, pos, Mark.HAT) elif mark == Mark.BREVE: if raw_vowel != "ua": - comp[1] = add_mark_at(comp[1], raw_vowel.find("a"), Mark.BREVE) + new_vowel = add_mark_at( + syllable.vowel, raw_vowel.find("a"), Mark.BREVE) elif mark == Mark.HORN: if raw_vowel in ("uo", "uoi", "uou"): - comp[1] = utils.join([add_mark_char(c, Mark.HORN) for c in comp[1][:2]]) + comp[1][2:] + new_vowel = "".join( + [add_mark_char(c, Mark.HORN) for c 
in syllable.vowel[:2]]) \ + + syllable.vowel[2:] elif raw_vowel == "oa": - comp[1] = add_mark_at(comp[1], 1, Mark.HORN) + new_vowel = add_mark_at(syllable.vowel, 1, Mark.HORN) else: pos = max(raw_vowel.find(""), raw_vowel.find("o")) - comp[1] = add_mark_at(comp[1], pos, Mark.HORN) + new_vowel = add_mark_at(syllable.vowel, pos, Mark.HORN) if mark == Mark.NONE: - if not raw_vowel == comp[1].lower(): - comp[1] = raw_vowel - elif comp[0] and comp[0][-1] == "đ": - comp[0] = comp[0][:-1] + "d" - return comp + if not raw_vowel == syllable.vowel.lower(): + new_vowel = raw_vowel + elif syllable.initial_consonant and \ + syllable.initial_consonant[-1] == "đ": + new_initial_consonant = syllable.initial_consonant[:-1] + "d" + + return Syllable(new_initial_consonant, new_vowel, new_final_consonant) def add_mark_at(string, index, mark): @@ -152,19 +162,18 @@ def add_mark_char(char, mark): return utils.change_case(new_char, case) -def is_valid_mark(comps, mark_trans): +def is_valid_mark(syllable, mark_trans): """ Check whether the mark given by mark_trans is valid to add to the components """ if mark_trans == "*_": return True - components = list(comps) - if mark_trans[0] == 'd' and components[0] \ - and components[0][-1].lower() in ("d", "đ"): + if mark_trans[0] == 'd' and syllable.initial_consonant \ + and syllable.initial_consonant[-1].lower() in ("d", "đ"): return True - elif components[1] != "" and \ - strip(components[1]).lower().find(mark_trans[0]) != -1: + elif syllable.vowel and \ + strip(syllable.vowel).lower().find(mark_trans[0]) != -1: return True else: return False diff --git a/bogo/syllable.py b/bogo/syllable.py new file mode 100644 index 0000000..8abae5b --- /dev/null +++ b/bogo/syllable.py @@ -0,0 +1,84 @@ +import collections +from bogo import utils + + +class Syllable(collections.namedtuple('Syllable', + ['initial_consonant', 'vowel', 'final_consonant'])): + + @staticmethod + def new_from_string(string): + """\ + Make a Syllable from a string. 
+ + Args: + - string: the string to be parsed + + Returns: + a Syllable + + >>> parse_syllable('tuong') + ('t','uo','ng') + >>> parse_syllable('ohmyfkinggod') + ('ohmyfkingg','o','d') + """ + def atomic_separate(string, last_chars, last_is_vowel): + if string == "" or (last_is_vowel != utils.is_vowel(string[-1])): + return (string, last_chars) + else: + return atomic_separate(string[:-1], + string[-1] + last_chars, last_is_vowel) + + head, last_consonant = atomic_separate(string, "", False) + first_consonant, vowel = atomic_separate(head, "", True) + + if last_consonant and not (vowel + first_consonant): + first_consonant = last_consonant + last_consonant = '' + + # 'gi' and 'qu' are considered qualified consonants. + # We want something like this: + # ['g', 'ia', ''] -> ['gi', 'a', ''] + # ['q', 'ua', ''] -> ['qu', 'a', ''] + if len(vowel) > 1 and \ + (first_consonant + vowel[0]).lower() in ['gi', 'qu']: + first_consonant += vowel[0] + vowel = vowel[1:] + + return Syllable(first_consonant, vowel, last_consonant) + + def as_string(self): + return self.initial_consonant + self.vowel + self.final_consonant + + def append_char(self, char): + """ + Append a character to `comps` following this rule: a vowel is added + to the vowel part if there is no last consonant, else to the last + consonant part; a consonant is added to the first consonant part + if there is no vowel, and to the last consonant part if the + vowel part is not empty. 
+ + >>> transform(['', '', '']) + ['c', '', ''] + >>> transform(['c', '', ''], '+o') + ['c', 'o', ''] + >>> transform(['c', 'o', ''], '+n') + ['c', 'o', 'n'] + >>> transform(['c', 'o', 'n'], '+o') + ['c', 'o', 'no'] + """ + initial_consonant = self.initial_consonant + vowel = self.vowel + final_consonant = self.final_consonant + + if utils.is_vowel(char): + if not self.final_consonant: + vowel = self.vowel + char + else: + final_consonant = self.final_consonant + char + else: + if not self.final_consonant and not self.vowel: + initial_consonant = self.initial_consonant + char + else: + final_consonant = self.final_consonant + char + + return Syllable(initial_consonant, vowel, final_consonant) diff --git a/bogo/test/test_accent.py b/bogo/test/test_accent.py index 4a574b3..f814a2c 100644 --- a/bogo/test/test_accent.py +++ b/bogo/test/test_accent.py @@ -1,3 +1,86 @@ # -*- coding: utf-8 -*- from __future__ import unicode_literals +from nose.tools import eq_ +from bogo.accent import add_accent, add_accent_char, Accent +from bogo.syllable import Syllable + + +class TestAddAccentChar(): + + def test_empty_char(self): + result = add_accent_char('', Accent.GRAVE) + expected = '' + eq_(result, expected) + + def test_out_of_range_accent(self): + result = add_accent_char('a', 293432) + expected = 'a' + eq_(result, expected) + + def test_normal_accent(self): + result = add_accent_char('a', Accent.ACUTE) + expected = 'á' + eq_(result, expected) + + def test_upper_case(self): + eq_(add_accent_char('A', Accent.ACUTE), 'Á') + + +class TestAddAccent(): + def test_remove_accent(self): + s = Syllable('c', 'á', 'c') + + result = add_accent(s, Accent.NONE) + expected = Syllable('c', 'a', 'c') + + eq_(result, expected) + + def test_e_hat(self): + s = Syllable('ch', 'uyê', 'n') + + result = add_accent(s, Accent.HOOK) + expected = Syllable('ch', 'uyể', 'n') + + eq_(result, expected) + + def test_o_horn(self): + s = Syllable('ch', 'ươ', 'ng') + + result = add_accent(s, Accent.HOOK) + 
expected = Syllable('ch', 'ưở', 'ng') + + eq_(result, expected) + + def test_double_vowel_no_final_consonant(self): + s = Syllable('c', 'ua', '') + + result = add_accent(s, Accent.HOOK) + expected = Syllable('c', 'ủa', '') + + eq_(result, expected) + + def test_double_vowel_with_final_consonant(self): + s = Syllable('c', 'uô', 'ng') + + result = add_accent(s, Accent.GRAVE) + expected = Syllable('c', 'uồ', 'ng') + + eq_(result, expected) + + def test_single_vowel(self): + s = Syllable('c', 'a', '') + + result = add_accent(s, Accent.ACUTE) + expected = Syllable('c', 'á', '') + + eq_(result, expected) + + s = Syllable('c', 'a', 'n') + + result = add_accent(s, Accent.ACUTE) + expected = Syllable('c', 'á', 'n') + + eq_(result, expected) + + \ No newline at end of file diff --git a/bogo/test/test_core.py b/bogo/test/test_core.py new file mode 100644 index 0000000..dc4d414 --- /dev/null +++ b/bogo/test/test_core.py @@ -0,0 +1,107 @@ +from __future__ import unicode_literals +from nose.tools import eq_ +from bogo import core +from bogo.syllable import Syllable +from bogo import accent +from bogo import mark + + +class TestAddCharTransformation: + + def test_add_simple_char(self): + t = core.AddCharTransformation('a', 'a') + syl = Syllable('', '', '') + + result = t.perform(syl) + + eq_(result, Syllable('', 'a', '')) + + +class TestAddToneMarkTransformation: + + def test_add_simple_tone(self): + trans = core.AddToneMarkTransformation('s', accent.Accent.ACUTE) + syl = Syllable('', 'a', '') + result = trans.perform(syl) + + eq_(result, Syllable('', 'á', '')) + + +class TestAddCharMarkTransformation: + + def test_add_simple_mark(self): + trans = core.AddCharMarkTransformation('a', mark.Mark.HAT) + syl = Syllable('', 'a', '') + result = trans.perform(syl) + + eq_(result, Syllable('', 'â', '')) + + +class TestRule: + + def test_parse_add_char(self): + result = core.Rule.parse_rule_action('<ư', 'w') + + eq_(type(result), core.AddCharTransformation) + eq_(result.char, 'ư') + 
eq_(result.key, 'w') + + def test_parse_add_tone(self): + result = core.Rule.parse_rule_action('?', 'r') + + eq_(type(result), core.AddToneMarkTransformation) + eq_(result.tone, accent.Accent.HOOK) + eq_(result.key, 'r') + + def test_parse_add_mark(self): + result = core.Rule.parse_rule_action('^', 'a') + + eq_(type(result), core.AddCharMarkTransformation) + eq_(result.mark, mark.Mark.HAT) + eq_(result.key, 'a') + + def test_transformations_from_key_rule_key(self): + rule = core.Rule({'w': ['*', '(']}) + trans_list = rule.transformations_from_key('w') + + eq_(len(trans_list), 3) + eq_(type(trans_list[0]), core.AddCharMarkTransformation) + eq_(type(trans_list[1]), core.AddCharMarkTransformation) + eq_(type(trans_list[2]), core.ByPassTransformation) + + def test_transformations_from_key_non_rule_key(self): + rule = core.Rule({'w': ['*', '(']}) + trans_list = rule.transformations_from_key('a') + + eq_(len(trans_list), 1) + eq_(type(trans_list[0]), core.ByPassTransformation) + + +class TestBoGo: + def test_add_key_add_char(self): + b = core.BoGo(core.Rule({})) + b.add_key('a') + + eq_(b.result(), 'a') + eq_(b.raw_string(), 'a') + + def test_add_key_add_mark(self): + b = core.BoGo(core.Rule({'a': '^'})) + b.add_key('a') + b.add_key('a') + + eq_(b.result(), 'â') + eq_(b.raw_string(), 'aa') + + def test_add_key_add_tone(self): + b = core.BoGo(core.Rule({'s': '/'})) + b.add_key('a') + b.add_key('s') + + eq_(b.result(), 'á') + eq_(b.raw_string(), 'as') + + +class TestProcessSequence: + def test_normal_typing(self): + eq_(core.process_sequence('as'), 'á') diff --git a/bogo/test/test_engine.py b/bogo/test/test_engine.py deleted file mode 100644 index 35b871f..0000000 --- a/bogo/test/test_engine.py +++ /dev/null @@ -1,191 +0,0 @@ -# -*- coding: utf-8 -*- - -from __future__ import unicode_literals -from nose.tools import eq_ -from nose.plugins.attrib import attr -from functools import partial -import codecs - -from bogo.core import _Action, _get_action, process_sequence -from 
bogo.mark import Mark -import os - - -process_key_no_skip = partial(process_sequence, skip_non_vietnamese=False) - - -class TestHelpers(): - def test_transform(self): - pass - - def test__get_action(self): - # Add mark - eq_(_get_action('a^'), (_Action.ADD_MARK, Mark.HAT)) - eq_(_get_action('a+'), (_Action.ADD_MARK, Mark.BREVE)) - eq_(_get_action('o*'), (_Action.ADD_MARK, Mark.HORN)) - eq_(_get_action('d-'), (_Action.ADD_MARK, Mark.BAR)) - - def test_get_transformation_list(self): - pass - - def test_can_undo(self): - pass - - def test_reverse(self): - pass - - -class TestProcessSeq(): - def test_normal_typing(self): - eq_(process_sequence('v'), 'v') - eq_(process_sequence('aw'), 'ă') - eq_(process_sequence('w'), 'ư') - eq_(process_sequence('ow'), 'ơ') - eq_(process_sequence('oo'), 'ô') - eq_(process_sequence('Oo'), 'Ô') - eq_(process_sequence('dd'), 'đ') - eq_(process_sequence('muaf'), 'mùa') - eq_(process_sequence('Doongd'), 'Đông') - eq_(process_sequence('gif'), 'gì') - eq_(process_sequence('loAnj'), 'loẠn') - eq_(process_sequence('muongw'), 'mương') - eq_(process_sequence('qur'), 'qur') - eq_(process_sequence('Tosan'), 'Toán') - eq_(process_sequence('tusnw'), 'tứn') - eq_(process_sequence('dee'), 'dê') - eq_(process_sequence('mowis'), 'mới') - eq_(process_sequence('uwa'), 'ưa') - eq_(process_sequence('uwo'), 'ưo') - eq_(process_sequence('ddx'), 'đx') - eq_(process_sequence('hoacw'), 'hoăc') - eq_(process_sequence('cuooi'), 'cuôi') - - eq_(process_sequence('tooi'), 'tôi') - eq_(process_sequence('chuyeenr'), 'chuyển') - eq_(process_sequence('ddoonjg'), 'động') - eq_(process_sequence('nheechs'), 'nhếch') - - # uơ related - eq_(process_sequence('quowr'), 'quở') - eq_(process_sequence('huow'), 'huơ') - eq_(process_sequence('thuowr'), 'thuở') - eq_(process_sequence('QUOWR'), 'QUỞ') - eq_(process_sequence('HUOW'), 'HUƠ') - eq_(process_sequence('THUOWR'), 'THUỞ') - - # English words - eq_(process_key_no_skip('case'), 'cáe') - eq_(process_key_no_skip('reset'), 'rết') - 
- @attr('slow') - def test_with_dictionary(self): - def atomic(word, sequence): - eq_(word, process_sequence(sequence)) - - path = os.path.join(os.path.dirname(__file__), 'DauCu.sequences') - with codecs.open(path, "r", "utf-8") as tests: - for test in tests.read().splitlines(): - sequence, word = test.rstrip().split(":") - yield atomic, word, sequence - - def test_bugs_related(self): - # naỳ. - eq_(process_sequence('nayf.'), 'này.') - - # nguời - eq_(process_sequence('nguowif'), 'người') - eq_(process_sequence('nguwowif'), 'người') - - # thươ. - eq_(process_sequence("thuowr."), "thuở.") - - eq_(process_sequence("[["), "[") - eq_(process_sequence("[["), "[") - - # BUG #77 - eq_(process_sequence("ddiemer"), "điểm") - - # BUG #78 - eq_(process_sequence("tuoufw"), "tườu") - - # BUG #79 - eq_(process_sequence("huoswc"), "hước") - - # BUG #81 - eq_(process_sequence("khoefo"), "khoèo") - - # BUG #82 - eq_(process_sequence("uorw"), "uở") - - def test_bug_93(self): - eq_(process_sequence("{{"), "{") - eq_(process_sequence("}}"), "}") - - def test_free_key_position(self): - eq_(process_sequence('toios'), 'tối') - eq_(process_sequence('toois'), 'tối') - eq_(process_sequence('toosi'), 'tối') - - eq_(process_sequence('tuyenre'), 'tuyển') - eq_(process_sequence('tuyener'), 'tuyển') - eq_(process_sequence('tuyeren'), 'tuyển') - eq_(process_sequence('tuyerne'), 'tuyển') - eq_(process_sequence('tuyeern'), 'tuyển') - eq_(process_sequence('tuyeenr'), 'tuyển') - - eq_(process_sequence('tuwrowng'), 'tưởng') - - def test_undo(self): - eq_(process_sequence('aaa'), 'aa') - eq_(process_sequence('aww'), 'aw') - eq_(process_sequence('ass'), 'as') - eq_(process_sequence('aff'), 'af') - eq_(process_sequence('arr'), 'ar') - eq_(process_sequence('axx'), 'ax') - eq_(process_sequence('ajj'), 'aj') - eq_(process_sequence('uww'), 'uw') - eq_(process_sequence('oww'), 'ow') - - eq_(process_sequence('huww'), 'huw') - eq_(process_sequence('hww'), 'hw') - eq_(process_sequence('ww'), 'w') - 
eq_(process_sequence('uww'), 'uw') - - eq_(process_sequence('DDd'), 'Dd') - - eq_(process_key_no_skip('Loorngr'), 'Lôngr') - eq_(process_key_no_skip('LOorngr'), 'LÔngr') - eq_(process_key_no_skip('DDoongd'), 'Dôngd') - eq_(process_key_no_skip('DDuowngd'), 'Dươngd') - eq_(process_key_no_skip('Duowngw'), 'Duongw') - - def test_non_vn(self): - def atomic(word): - eq_(process_sequence(word), word) - - tests = [ - "system", - "Virtualbox", - "VMWare", - "Microsoft", - "Google", - "Installation", - "teardown", - "generators", - "event-driven", - "flow" - ] - - for test in tests: - yield atomic, test - - eq_(process_sequence("aans."), "ấn.") - eq_(process_sequence("aans]"), "ấn]") - # eq_(process_sequence("aans.tuongwj"), "ấn.tượng") - eq_(process_sequence("gi[f"), "giờ") - # eq_(process_sequence("taojc"), "taojc") - - def test_with_separator(self): - eq_(process_sequence('con meof dideen'), 'con mèo điên') - eq_(process_sequence('con.meof'), 'con.mèo') - eq_(process_sequence('con?meof'), 'con?mèo') diff --git a/bogo/test/test_mark.py b/bogo/test/test_mark.py index d98dca1..6f403da 100644 --- a/bogo/test/test_mark.py +++ b/bogo/test/test_mark.py @@ -168,10 +168,17 @@ def test_add_mark_at(self): eq_(add_mark_at('e', 0, Mark.HAT), 'ê') def test_add_mark(self): - eq_(add_mark(['d', 'uo', 'ng'], Mark.BAR), ['đ', 'uo', 'ng']) - eq_(add_mark(['d', 'uo', 'ng'], Mark.HORN), ['d', 'ươ', 'ng']) - eq_(add_mark(['d', 'uô', 'ng'], Mark.HORN), ['d', 'ươ', 'ng']) - eq_(add_mark(['d', 'Á', ''], Mark.HAT), ['d', 'Ấ', '']) - eq_(add_mark(['d', '', ''], Mark.BAR), ['đ', '', '']) - eq_(add_mark(['D', 'uo', 'ng'], Mark.BAR), ['Đ', 'uo', 'ng']) - eq_(add_mark(['d', 'e', ''], Mark.HAT), ['d', 'ê', '']) + eq_(add_mark(Syllable('d', 'uo', 'ng'), Mark.BAR), + Syllable('đ', 'uo', 'ng')) + eq_(add_mark(Syllable('d', 'uo', 'ng'), Mark.HORN), + Syllable('d', 'ươ', 'ng')) + eq_(add_mark(Syllable('d', 'uô', 'ng'), Mark.HORN), + Syllable('d', 'ươ', 'ng')) + eq_(add_mark(Syllable('d', 'Á', ''), Mark.HAT), 
+ Syllable('d', 'Ấ', '')) + eq_(add_mark(Syllable('d', '', ''), Mark.BAR), + Syllable('đ', '', '')) + eq_(add_mark(Syllable('D', 'uo', 'ng'), Mark.BAR), + Syllable('Đ', 'uo', 'ng')) + eq_(add_mark(Syllable('d', 'e', ''), Mark.HAT), + Syllable('d', 'ê', '')) diff --git a/bogo/test/test_syllable.py b/bogo/test/test_syllable.py new file mode 100644 index 0000000..15821c6 --- /dev/null +++ b/bogo/test/test_syllable.py @@ -0,0 +1,70 @@ +from nose.tools import eq_ +from bogo.syllable import Syllable + + +class TestSyllable(): + + def test_parse_simple_syllable(self): + parsed = Syllable.new_from_string('tuong') + + expected = Syllable('t', 'uo', 'ng') + eq_(parsed, expected) + + def test_parse_qua(self): + parsed = Syllable.new_from_string('qua') + + expected = Syllable('qu', 'a', '') + eq_(parsed, expected) + + def test_parse_gia(self): + parsed = Syllable.new_from_string('gia') + + expected = Syllable('gi', 'a', '') + eq_(parsed, expected) + + def test_parse_gi(self): + parsed = Syllable.new_from_string('gi') + + expected = Syllable('g', 'i', '') + eq_(parsed, expected) + + def test_parse_rubbish(self): + parsed = Syllable.new_from_string('ohmyfkinggod') + + expected = Syllable('ohmyfkingg', 'o', 'd') + eq_(parsed, expected) + + def test_append_initial_consonant(self): + s = Syllable('c', '', '') + s = s.append_char('c') + + expected = Syllable('cc', '', '') + eq_(s, expected) + + def test_append_initial_consonant_empty(self): + s = Syllable('', '', '') + s = s.append_char('c') + + expected = Syllable('c', '', '') + eq_(s, expected) + + def test_append_vowel(self): + s = Syllable('c', 'a', '') + s = s.append_char('a') + + expected = Syllable('c', 'aa', '') + eq_(s, expected) + + def test_append_vowel_empty(self): + s = Syllable('', '', '') + s = s.append_char('a') + + expected = Syllable('', 'a', '') + eq_(s, expected) + + def test_append_final_consonant(self): + s = Syllable('c', 'a', 'c') + s = s.append_char('c') + + expected = Syllable('c', 'a', 'cc') + eq_(s, 
def keep_case(function):
    """
    Decorator to ensure that the letter case of the input and
    output of a function stays the same.

    This function assumes that the decorated function takes
    a string as the first argument and returns a modified
    version of it. Also, the string argument will be normalized
    to lower case before being passed to the decorated function.

    The case of the original string is classified as title case,
    upper case or lower case (checked in that order) and the same
    casing is re-applied to the result. If the input matches none
    of these (e.g. an empty or mixed-case string), the result of
    the decorated function is returned untouched.

    Args:
        function: a callable whose first positional argument is a string
            and which returns a string.

    Returns:
        A wrapper with the same name and docstring as `function`.
    """
    # Imported locally so this block does not depend on the module's
    # import section.
    import functools

    @functools.wraps(function)
    def inner(string, *args, **kwargs):
        # Classify the input *before* it is normalized. Title case is
        # tested first because a string such as "A" is both upper and
        # title case; this mirrors the historical precedence.
        if string.istitle():
            case_method = "title"
        elif string.isupper():
            case_method = "upper"
        elif string.islower():
            case_method = "lower"
        else:
            case_method = None

        modified_string = function(string.lower(), *args, **kwargs)

        if case_method is None:
            return modified_string

        # Call the method on the result itself rather than through the
        # unbound `str.<method>`, so that both byte and unicode strings
        # work on Python 2 as well as Python 3.
        return getattr(modified_string, case_method)()

    return inner