# -*- coding: utf-8 -*-

import unicodedata

# A local copy of levenshtein() is defined below, used instead of:
# from MythTV.utility import levenshtein


def normalize_unicode(s):
    """Return ``s`` converted to Unicode normalization form NFKC.

    NFKC applies the compatibility decomposition and then recomposes the
    result, so differently stored but equivalent strings end up with the
    same code point sequence.

    Background reading:
       https://en.wikipedia.org/wiki/Unicode_equivalence
       https://stackoverflow.com/questions/29243962/levenshtein-distance-in-python-wrong-result-with-national-characters
       https://stackoverflow.com/questions/14682397/how-does-unicodedata-normalizeform-unistr-work
    """
    normalized = unicodedata.normalize('NFKC', s)
    return normalized


def levenshtein(s1, s2):
    """Return the Levenshtein (edit) distance between two sequences.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn one argument into the
    other. Elements are compared with ``!=`` exactly as given; no Unicode
    normalization is applied here.
    """
    # Row-by-row dynamic programming, see
    # http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Levenshtein_distance

    # Iterate rows over the longer argument so that each row (one cell per
    # element of the shorter argument, plus one) stays as small as possible.
    if len(s1) >= len(s2):
        longer, shorter = s1, s2
    else:
        longer, shorter = s2, s1

    if not longer:
        return len(shorter)

    # Row 0: distance from the empty prefix to every prefix of `shorter`.
    above = range(len(shorter) + 1)
    for row, item_long in enumerate(longer, 1):
        # First cell: distance from `row` elements to the empty prefix.
        current = [row]
        for col, item_short in enumerate(shorter):
            current.append(min(
                above[col + 1] + 1,
                current[col] + 1,
                above[col] + (item_long != item_short),
            ))
        above = current

    return above[-1]


if __name__ == '__main__':
    # Demonstration: prints one number per line; the value expected for
    # each call is noted in the trailing comment.

    # Equivalent Unicode strings can be stored as different code point
    # sequences, so they should be normalized before being compared.
    # See https://en.wikipedia.org/wiki/Unicode_equivalence
    # Decompose and recompose a string containing an `Ä`.
    # NOTE: the expected values assume the literal below stores `Ä` as the
    # single precomposed code point U+00C4.
    composed_str   = u"Madam I'm Ädam"
    decomposed_str = unicodedata.normalize('NFKD', composed_str)
    # Use the module's own helper (NFKC) for the recomposition.
    recomposed_str = normalize_unicode(decomposed_str)

    print(len(composed_str))                               # 14
    print(len(decomposed_str))                             # 15 (`A` + combining diaeresis)

    print(levenshtein(u"Madam I'm Adam", composed_str))     # 1
    print(levenshtein(u"Madam I'm Adam", decomposed_str))   # 1
    print(levenshtein(composed_str, decomposed_str))        # 2
    print(levenshtein(composed_str, recomposed_str))        # 0


    # Check UTF-8 encoded strings: the distance is then counted in encoded
    # bytes, and `Ä` (U+00C4) takes two bytes in UTF-8.
    utf_str1 = u"Madam I'm Ädam".encode('utf-8')
    utf_str2 = u"Madam I'm Adam".encode('utf-8')

    print(len(utf_str1))                                   # 15
    print(len(utf_str2))                                   # 14
    print(levenshtein(utf_str1, utf_str2))                  # 2
    print(levenshtein(utf_str1, utf_str1))                  # 0

