Context Navigation

Back to Ticket #13514

Ticket #13514: levenshtein_normalize_unicode.py

File levenshtein_normalize_unicode.py, 2.4 KB (added by rcrdnalor, 6 years ago)
Usage of levenshtein within python bindings

Line
1	# -- coding: utf-8 --
2
3	import unicodedata
4
5	# from MythTV.utility import levenshtein
6	## see below copy
7
8
def normalize_unicode(s):
    """Return the unicode string `s` in normalized composition form (NFKC).

    NFKC applies compatibility decomposition followed by canonical
    composition, so a base letter followed by a combining mark
    (e.g. 'A' + U+0303) is folded into the single precomposed character
    ('Ã') where one exists.  Normalizing both strings before comparing
    them makes visually identical text compare equal element by element.

    See
    https://en.wikipedia.org/wiki/Unicode_equivalence
    https://stackoverflow.com/questions/29243962/levenshtein-distance-in-python-wrong-result-with-national-characters
    https://stackoverflow.com/questions/14682397/how-does-unicodedata-normalizeform-unistr-work
    """
    return unicodedata.normalize('NFKC', s)
17
18
def levenshtein(s1, s2):
    """Compute the Levenshtein distance of two strings.

    The distance is the minimum number of single-element insertions,
    deletions and substitutions needed to turn `s1` into `s2`.

    The comparison is done element by element, exactly as the sequences
    are given: for unicode strings the elements are code points, for
    encoded byte strings they are bytes.  Unicode strings that merely
    look the same (composed vs. decomposed form) therefore have a
    non-zero distance unless they are normalized first.

    Returns the distance as a non-negative integer.
    """
    # http://en.wikibooks.org/wiki/Algorithm_implementation/Strings/Levenshtein_distance

    # The distance is symmetric; keep the shorter sequence in `s2` so the
    # rows held in memory are as short as possible.
    if len(s1) < len(s2):
        s1, s2 = s2, s1
    # Turning `s1` into an empty sequence takes len(s1) deletions.
    if not s2:
        return len(s1)

    # previous_row[j] is the distance between the processed prefix of s1
    # and the first j elements of s2; initially the prefix is empty.
    previous_row = list(range(len(s2) + 1))
    for i, c1 in enumerate(s1):
        current_row = [i + 1]
        for j, c2 in enumerate(s2):
            insertions = previous_row[j + 1] + 1
            deletions = current_row[j] + 1
            # (c1 != c2) is a bool: adds 1 on mismatch, 0 on match.
            substitutions = previous_row[j] + (c1 != c2)
            current_row.append(min(insertions, deletions, substitutions))
        previous_row = current_row

    return previous_row[-1]
40
41
if __name__ == '__main__':

    # Demo: unicode strings that look the same may consist of different
    # code point sequences, so let's normalize them before comparing.
    # See https://en.wikipedia.org/wiki/Unicode_equivalence
    # Decompose and recompose a string containing an `Ã`:
    composed_str = u"Madam I'm Ãdam"
    decomposed_str = unicodedata.normalize('NFKD', composed_str)
    recomposed_str = unicodedata.normalize('NFKC', decomposed_str)

    print(len(composed_str))    # 14
    print(len(decomposed_str))  # 15 ('Ã' became 'A' + combining tilde)

    print(levenshtein(u"Madam I'm Adam", composed_str))    # 1
    print(levenshtein(u"Madam I'm Adam", decomposed_str))  # 1
    print(levenshtein(composed_str, decomposed_str))       # 2
    print(levenshtein(composed_str, recomposed_str))       # 0

    # Check UTF-8 encoded strings (the distance is then counted in bytes):
    utf_str1 = u"Madam I'm Ãdam".encode('utf-8')
    utf_str2 = u"Madam I'm Adam".encode('utf-8')

    print(len(utf_str1))  # 15 ('Ã' takes two bytes in UTF-8)
    print(len(utf_str2))  # 14
    print(levenshtein(utf_str1, utf_str2))  # 2
    print(levenshtein(utf_str1, utf_str1))  # 0
68

Download in other formats:

Original Format