Ticket #13514: levenshtein_normalize_unicode.py

File levenshtein_normalize_unicode.py, 2.4 KB (added by rcrdnalor, 6 years ago)

Usage of levenshtein within python bindings

Line 
1# -*- coding: utf-8 -*-
2
3import unicodedata
4
5# from MythTV.utility import levenshtein
6 ## see below copy
7
8
def normalize_unicode(s):
    """Return the NFKC-normalized version of the unicode string `s`.

    The string is handed to `unicodedata.normalize` with the form 'NFKC'
    (compatibility decomposition followed by canonical composition), so
    that equivalent spellings of the same text, such as a precomposed
    letter versus a base letter plus a combining mark, end up as the same
    sequence of code points.

    Background reading:
        https://en.wikipedia.org/wiki/Unicode_equivalence
        https://stackoverflow.com/questions/29243962/levenshtein-distance-in-python-wrong-result-with-national-characters
        https://stackoverflow.com/questions/14682397/how-does-unicodedata-normalizeform-unistr-work
    """
    target_form = 'NFKC'
    normalized = unicodedata.normalize(target_form, s)
    return normalized
17
18
def levenshtein(s1, s2):
    """Compute the Levenshtein distance of two strings.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn one sequence into the other.

    The inputs are compared element by element exactly as given; no Unicode
    normalization is performed here. A precomposed character and its
    decomposed spelling (base letter + combining mark) therefore count as
    different elements, and encoded byte strings are compared per byte.
    Normalize or decode the inputs first if that is not what you want.

    Args:
        s1: first sequence (anything supporting len() and iteration).
        s2: second sequence of the same kind.

    Returns:
        The edit distance as a non-negative int.
    """
    # http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Levenshtein_distance

    # Keep the longer sequence in s1 so that each row of the table only
    # needs len(shorter) + 1 cells. The distance is symmetric, so swapping
    # the arguments does not change the result.
    if len(s1) < len(s2):
        s1, s2 = s2, s1

    # s2 is now the shorter input: if it is empty, the only way to reach s1
    # is to insert every element of s1.
    if not s2:
        return len(s1)

    # previous_row[j] holds the distance between the already processed
    # prefix of s1 and the first j elements of s2.
    previous_row = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1, start=1):
        # Turning the first i elements of s1 into an empty prefix of s2
        # costs i deletions.
        current_row = [i]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            # (c1 != c2) is a bool used as 0/1: matching elements are free.
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row

    return previous_row[-1]
40
41
if __name__ == '__main__':

    # Unicode strings that look the same may be built from different code
    # points, so let's normalize them before comparing:
    # See https://en.wikipedia.org/wiki/Unicode_equivalence
    # decompose and recompose string containing an `Ä` :
    composed_str = u"Madam I'm Ädam"
    decomposed_str = unicodedata.normalize('NFKD', composed_str)
    # Recompose through the helper defined in this file (NFKC).
    recomposed_str = normalize_unicode(decomposed_str)

    print(len(composed_str))                                # 14
    print(len(decomposed_str))                              # 15

    print(levenshtein(u"Madam I'm Adam", composed_str))     # 1
    print(levenshtein(u"Madam I'm Adam", decomposed_str))   # 1
    print(levenshtein(composed_str, decomposed_str))        # 2
    print(levenshtein(composed_str, recomposed_str))        # 0

    # check utf-8 encoded strings:
    # `Ä` takes two bytes in UTF-8, so the encoded string containing it is
    # one byte longer than the plain-ASCII one.
    utf_str1 = u"Madam I'm Ädam".encode('utf-8')
    utf_str2 = u"Madam I'm Adam".encode('utf-8')

    print(len(utf_str1))                                    # 15
    print(len(utf_str2))                                    # 14
    print(levenshtein(utf_str1, utf_str2))                  # 2
    print(levenshtein(utf_str1, utf_str1))                  # 0
68