比较字符串的近似程度
{
Compares two strings in percent (how they are similar to each other)
Returns byte value from 0 to 100%
examples:
var
Percent: byte;
begin
Percent := CompareStringsInPercent('this is a test', 'This is another test'); // 37%
Percent := CompareStringsInPercent('this is some string', 'and yet another some string'); // 24%
Percent := CompareStringsInPercent('abcde', 'fghij'); // 0%
Percent := CompareStringsInPercent('1.jpg', '2.jpg'); // 81%
...
}
function CompareStringsInPercent(Str1, Str2: string): Byte;
type
TLink = array[0..1] of Byte;
var
tmpPattern: TLink;
PatternA, PatternB: array of TLink;
IndexA, IndexB, LengthStr: Integer;
begin
Result := 100;
// Building pattern tables
LengthStr := Max(Length(Str1), Length(Str2));
for IndexA := 1 to LengthStr do
begin
if Length(Str1) >= IndexA then
begin
SetLength(PatternA, (Length(PatternA) + 1));
PatternA[Length(PatternA) - 1][0] := Byte(Str1[IndexA]);
PatternA[Length(PatternA) - 1][1] := IndexA;
end;
if Length(Str2) >= IndexA then
begin
SetLength(PatternB, (Length(PatternB) + 1));
PatternB[Length(PatternB) - 1][0] := Byte(Str2[IndexA]);
PatternB[Length(PatternB) - 1][1] := IndexA;
end;
end;
// Quick Sort of pattern tables
IndexA := 0;
IndexB := 0;
while ((IndexA < (Length(PatternA) - 1)) and (IndexB < (Length(PatternB) - 1))) do
begin
if Length(PatternA) > IndexA then
begin
if PatternA[IndexA][0] < PatternA[IndexA + 1][0] then
begin
tmpPattern[0] := PatternA[IndexA][0];
tmpPattern[1] := PatternA[IndexA][1];
PatternA[IndexA][0] := PatternA[IndexA + 1][0];
PatternA[IndexA][1] := PatternA[IndexA + 1][1];
PatternA[IndexA + 1][0] := tmpPattern[0];
PatternA[IndexA + 1][1] := tmpPattern[1];
if IndexA > 0 then Dec(IndexA);
end
else
Inc(IndexA);
end;
if Length(PatternB) > IndexB then
begin
if PatternB[IndexB][0] < PatternB[IndexB + 1][0] then
begin
tmpPattern[0] := PatternB[IndexB][0];
tmpPattern[1] := PatternB[IndexB][1];
PatternB[IndexB][0] := PatternB[IndexB + 1][0];
PatternB[IndexB][1] := PatternB[IndexB + 1][1];
PatternB[IndexB + 1][0] := tmpPattern[0];
PatternB[IndexB + 1][1] := tmpPattern[1];
if IndexB > 0 then Dec(IndexB);
end
else
Inc(IndexB);
end;
end;
// Calculating simularity percentage
LengthStr := Min(Length(PatternA), Length(PatternB));
for IndexA := 0 to (LengthStr - 1) do
begin
if PatternA[IndexA][0] = PatternB[IndexA][0] then
begin
if Max(PatternA[IndexA][1], PatternB[IndexA][1]) - Min(PatternA[IndexA][1],
PatternB[IndexA][1]) > 0 then Dec(Result,
((100 div LengthStr) div (Max(PatternA[IndexA][1], PatternB[IndexA][1]) -
Min(PatternA[IndexA][1], PatternB[IndexA][1]))))
else if Result < 100 then Inc(Result);
end
else
Dec(Result, (100 div LengthStr))
end;
SetLength(PatternA, 0);
SetLength(PatternB, 0);
end;
---------------------------------------
function Max(i1,i2:Integer):Integer;
begin
if i1>=i2 then
Result:=i1
else
Result:=i2;
end;
function StrSimilar (s1, s2: string; tolerant: Boolean): Integer;
var hit: Integer; // Number of identical chars
p1, p2: Integer; // Position count
l1, l2, l: Integer; // Length of strings
diff: Integer; // unsharp factor
hstr: string; // help var for swapping strings
// Array shows if position is already tested
test: Classes.TBits;
function CompChar (ch1, ch2: Char): Boolean;
// german "umlauts" and similar charactes
begin
if tolerant then begin
ch1:= UpCase (ch1); // compare case insensitive
ch2:= UpCase (ch2);
case ch1 of
'A', 'E': Result:= ch2 in ['A', 'E'];
'B', 'P': Result:= ch2 in ['B', 'P'];
'C', 'Z': Result:= ch2 in ['C', 'Z'];
'D', 'T': Result:= ch2 in ['D', 'T'];
'F', 'V': Result:= ch2 in ['F', 'V'];
'G', 'K': Result:= ch2 in ['G', 'K'];
'S': Result:= ch2 in ['S'];
'I', 'J',
'Y': Result:= ch2 in ['I', 'J', 'Y'];
else Result:= ch1 = ch2;
end;
end else begin
Result:= ch1 = ch2;
end;
end;
begin
l1:= Length (s1);
l2:= Length (s2);
if (l1 <= 0) or (l2 <= 0) then begin Result:= 0; Exit; end;
// Test Length and swap, if s1 is smaller
if l1 < l2 then begin
hstr:= s2; s2:= s1; s1:= hstr;
l:= l2; l2:= l1; l1:= l;
end;
p1:= 1; p2:= 1; hit:= 0;
// calc the unsharp factor depending on
// the length of the strings
diff:= Max (l1, l2) div 3 + ABS (l1 - l2);
// init the test array
test:= Classes.TBits.Create;
// Calc size of TBits. Must be two bigger, because we're 0-Based
// counting from 1, and we need one more then stringlength
test.Size:= l1 + 2;
// loop through the string
repeat
// position tested?
if not test.Bits[p1] then begin
// found a matching character?
if CompChar (s1[p1], s2[p2]) and
(ABS(p1-p2) <= diff) then begin
test.Bits[p1]:= True;
Inc (hit); // increment the hit count
// next positions
Inc (p1); Inc (p2);
if p1 > l1 then p1:= 1;
end else begin
// Set test array
test.Bits[p1]:= False;
Inc (p1);
// Loop back to next test position
if p1 > l1 then begin
while (p1 > 1) and not (test[p1]) do Dec (p1);
Inc (p2)
end;
end;
end else begin
Inc (p1);
// Loop back to next test position
if p1 > l1 then begin
repeat Dec (p1); until (p1 = 1) or test.Bits[p1];
Inc (p2);
end;
end;
until p2 > l2;
test.Free; // Release Booleanlist
// calc procentual value
Result:= 100 * hit DIV l1;
end;
procedure TForm1.Button1Click(Sender: TObject);
begin
Caption:=IntToStr(StrSimilar(Edit1.Text,Edit2.Text,true));
end;
eg:
Edit1.Text is 福建省厦门市中山路11号
if Edit2.Text is 福建厦门中山路11号 -> 81
if Edit2.Text is 福建厦门中山路11 -> 72
if Edit2.Text is 福建厦门中山路 -> 63
if Edit2.Text is 厦门市福建省中山路11号 -> 59
原算法的作者为 Peter Hellinger 24. May 2000
---------------------------------------
'John' and 'John' = 100%
'John' and 'Jon' = 75%
'Jim' and 'James' = 40%
"Luke Skywalker" and 'Darth Vader' = 0%
function StrSimilar (s1, s2: string): Integer;
var hit: Integer; // Number of identical chars
p1, p2: Integer; // Position count
l1, l2: Integer; // Length of strings
pt: Integer; // for counter
diff: Integer; // unsharp factor
hstr: string; // help var for swapping strings
// Array shows is position is already tested
test: array [1..255] of Boolean;
begin
// Test Length and swap, if s1 is smaller
// we alway search along the longer string
if Length(s1) < Length(s2) then begin
hstr:= s2; s2:= s1; s1:= hstr;
end;
// store length of strings to speed up the function
l1:= Length (s1);
l2:= Length (s2);
p1:= 1; p2:= 1; hit:= 0;
// calc the unsharp factor depending on the length
// of the strings. Its about a third of the length
diff:= Max (l1, l2) div 3 + ABS (l1 - l2);
// init the test array
for pt:= 1 to l1 do test[pt]:= False;
// loop through the string
repeat
// position tested?
if not test[p1] then begin
// found a matching character?
if (s1[p1] = s2[p2]) and (ABS(p1-p2) <= diff) then begin
test[p1]:= True;
Inc (hit); // increment the hit count
// next positions
Inc (p1); Inc (p2);
if p1 > l1 then p1:= 1;
end else begin
// Set test array
test[p1]:= False;
Inc (p1);
// Loop back to next test position if end of the string
if p1 > l1 then begin
while (p1 > 1) and not (test[p1]) do Dec (p1);
Inc (p2)
end;
end;
end else begin
Inc (p1);
// Loop back to next test position if end of string
if p1 > l1 then begin
repeat Dec (p1); until (p1 = 1) or test[p1];
Inc (p2);
end;
end;
until p2 > Length(s2);
// calc procentual value
Result:= 100 * hit DIV l1;
end;