Viewing File: /home/ubuntu/.local/lib/python3.10/site-packages/distance/_fastcomp.py

# -*- coding: utf-8 -*-

def fast_comp(seq1, seq2, transpositions=False):
	"""Compute the distance between the two sequences `seq1` and `seq2` up to a
	maximum of 2 included, and return it. If the edit distance between the two
	sequences is higher than that, -1 is returned.
	
	If `transpositions` is `True`, transpositions will be taken into account for
	the computation of the distance. This can make a difference, e.g.:

		>>> fast_comp("abc", "bac", transpositions=False)
		2
		>>> fast_comp("abc", "bac", transpositions=True)
		1
	
	This is faster than `levenshtein` by an order of magnitude, but on the
	other hand is of limited use.

	The algorithm comes from `http://writingarchives.sakura.ne.jp/fastcomp`.
	I've added transpositions support to the original code.
	"""
	replace, insert, delete = "r", "i", "d"

	L1, L2  = len(seq1), len(seq2)
	if L1 < L2:
		L1, L2 = L2, L1
		seq1, seq2 = seq2, seq1

	ldiff = L1 - L2
	if ldiff == 0:
		models = (insert+delete, delete+insert, replace+replace)
	elif ldiff == 1:
		models = (delete+replace, replace+delete)
	elif ldiff == 2:
		models = (delete+delete,)
	else:
		return -1

	res = 3
	for model in models:
		i = j = c = 0
		while (i < L1) and (j < L2):
			if seq1[i] != seq2[j]:
				c = c+1
				if 2 < c:
					break
            
				if transpositions and ldiff != 2 \
            	and i < L1 - 1 and j < L2 - 1 \
            	and seq1[i+1] == seq2[j] and seq1[i] == seq2[j+1]:
					i, j = i+2, j+2
				else:
					cmd = model[c-1]
					if cmd == delete:
						i = i+1
					elif cmd == insert:
						j = j+1
					else:
						assert cmd == replace
						i,j = i+1, j+1
			else:
				i,j = i+1, j+1

		if 2 < c:
			continue
		elif i < L1:
			if L1-i <= model[c:].count(delete):
				c = c + (L1-i)
			else:
				continue
		elif j < L2:
			if L2-j <= model[c:].count(insert):
				c = c + (L2-j)
			else:
				continue

		if c < res:
			res = c

	if res == 3:
		res = -1
	return res
Back to Directory File Manager