| 1 | # -*- coding: utf-8 -*-
|
|---|
| 2 |
|
|---|
| 3 | import unicodedata
|
|---|
| 4 |
|
|---|
| 5 | # from MythTV.utility import levenshtein
|
|---|
| 6 | ## see below copy
|
|---|
| 7 |
|
|---|
| 8 |
|
|---|
| 9 | def normalize_unicode(s):
|
|---|
| 10 | """Returns a unicode string in the normalized composition form.
|
|---|
| 11 | See
|
|---|
| 12 | https://en.wikipedia.org/wiki/Unicode_equivalence
|
|---|
| 13 | https://stackoverflow.com/questions/29243962/levenshtein-distance-in-python-wrong-result-with-national-characters
|
|---|
| 14 | https://stackoverflow.com/questions/14682397/how-does-unicodedata-normalizeform-unistr-work
|
|---|
| 15 | """
|
|---|
| 16 | return unicodedata.normalize('NFKC', s)
|
|---|
| 17 |
|
|---|
| 18 |
|
|---|
| 19 | def levenshtein(s1, s2):
|
|---|
| 20 | """Compute the Levenshtein distance of two strings.
|
|---|
| 21 | """
|
|---|
| 22 | # http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Levenshtein_distance
|
|---|
| 23 |
|
|---|
| 24 | if len(s1) < len(s2):
|
|---|
| 25 | return levenshtein(s2, s1)
|
|---|
| 26 | if not s1:
|
|---|
| 27 | return len(s2)
|
|---|
| 28 |
|
|---|
| 29 | previous_row = range(len(s2) + 1)
|
|---|
| 30 | for i, c1 in enumerate(s1):
|
|---|
| 31 | current_row = [i + 1]
|
|---|
| 32 | for j, c2 in enumerate(s2):
|
|---|
| 33 | insertions = previous_row[j + 1] + 1
|
|---|
| 34 | deletions = current_row[j] + 1
|
|---|
| 35 | substitutions = previous_row[j] + (c1 != c2)
|
|---|
| 36 | current_row.append(min(insertions, deletions, substitutions))
|
|---|
| 37 | previous_row = current_row
|
|---|
| 38 |
|
|---|
| 39 | return previous_row[-1]
|
|---|
| 40 |
|
|---|
| 41 |
|
|---|
| 42 | if __name__ == '__main__':
|
|---|
| 43 |
|
|---|
| 44 | # Unicode strings may look different, lets normalize them:
|
|---|
| 45 | # See https://en.wikipedia.org/wiki/Unicode_equivalence
|
|---|
| 46 | # decompose and recompose string containing an `Ã` :
|
|---|
| 47 | composed_str = u"Madam I'm Ãdam"
|
|---|
| 48 | decomposed_str = unicodedata.normalize('NFKD', composed_str)
|
|---|
| 49 | recomposed_str = unicodedata.normalize('NFKC', decomposed_str)
|
|---|
| 50 |
|
|---|
| 51 | print(len(composed_str)) # 14
|
|---|
| 52 | print(len(decomposed_str)) # 15
|
|---|
| 53 |
|
|---|
| 54 | print(levenshtein(u"Madam I'm Adam", composed_str)) # 1
|
|---|
| 55 | print(levenshtein(u"Madam I'm Adam", decomposed_str)) # 1
|
|---|
| 56 | print(levenshtein(composed_str, decomposed_str)) # 2
|
|---|
| 57 | print(levenshtein(composed_str, recomposed_str)) # 0
|
|---|
| 58 |
|
|---|
| 59 |
|
|---|
| 60 | # check utf-8 encoded strings:
|
|---|
| 61 | utf_str1 = u"Madam I'm Ãdam".encode('utf-8')
|
|---|
| 62 | utf_str2 = u"Madam I'm Adam".encode('utf-8')
|
|---|
| 63 |
|
|---|
| 64 | print(len(utf_str1)) # 14
|
|---|
| 65 | print(len(utf_str2)) # 15
|
|---|
| 66 | print(levenshtein(utf_str1, utf_str2)) # 2
|
|---|
| 67 | print(levenshtein(utf_str1, utf_str1)) # 0
|
|---|
| 68 |
|
|---|