比较字符串的近似程度

Tags: /超级猛料/String.字符串处理/ Date Created:

比较字符串的近似程度

{

Compares two strings in percent (how they are similar to each other)

Returns byte value from 0 to 100%

examples:

var

Percent: byte;

begin

Percent := CompareStringsInPercent('this is a test', 'This is another test'); // 37%

Percent := CompareStringsInPercent('this is some string', 'and yet another some string'); // 24%

Percent := CompareStringsInPercent('abcde', 'fghij'); // 0%

Percent := CompareStringsInPercent('1.jpg', '2.jpg'); // 81%

...

}

function CompareStringsInPercent(Str1, Str2: string): Byte;

type

TLink = array[0..1] of Byte;

var

tmpPattern: TLink;

PatternA, PatternB: array of TLink;

IndexA, IndexB, LengthStr: Integer;

begin

Result := 100;

// Building pattern tables

LengthStr := Max(Length(Str1), Length(Str2));

for IndexA := 1 to LengthStr do

begin

if Length(Str1) >= IndexA then

begin

SetLength(PatternA, (Length(PatternA) + 1));

PatternA[Length(PatternA) - 1][0] := Byte(Str1[IndexA]);

PatternA[Length(PatternA) - 1][1] := IndexA;

end;

if Length(Str2) >= IndexA then

begin

SetLength(PatternB, (Length(PatternB) + 1));

PatternB[Length(PatternB) - 1][0] := Byte(Str2[IndexA]);

PatternB[Length(PatternB) - 1][1] := IndexA;

end;

// Quick Sort of pattern tables

IndexA := 0;

IndexB := 0;

while ((IndexA < (Length(PatternA) - 1)) and (IndexB < (Length(PatternB) - 1))) do

begin

if Length(PatternA) > IndexA then

begin

if PatternA[IndexA][0] < PatternA[IndexA + 1][0] then

begin

tmpPattern[0] := PatternA[IndexA][0];

tmpPattern[1] := PatternA[IndexA][1];

PatternA[IndexA][0] := PatternA[IndexA + 1][0];

PatternA[IndexA][1] := PatternA[IndexA + 1][1];

PatternA[IndexA + 1][0] := tmpPattern[0];

PatternA[IndexA + 1][1] := tmpPattern[1];

if IndexA > 0 then Dec(IndexA);

end

else

Inc(IndexA);

end;

if Length(PatternB) > IndexB then

begin

if PatternB[IndexB][0] < PatternB[IndexB + 1][0] then

begin

tmpPattern[0] := PatternB[IndexB][0];

tmpPattern[1] := PatternB[IndexB][1];

PatternB[IndexB][0] := PatternB[IndexB + 1][0];

PatternB[IndexB][1] := PatternB[IndexB + 1][1];

PatternB[IndexB + 1][0] := tmpPattern[0];

PatternB[IndexB + 1][1] := tmpPattern[1];

if IndexB > 0 then Dec(IndexB);

end

else

Inc(IndexB);

end;

// Calculating simularity percentage

LengthStr := Min(Length(PatternA), Length(PatternB));

for IndexA := 0 to (LengthStr - 1) do

begin

if PatternA[IndexA][0] = PatternB[IndexA][0] then

begin

if Max(PatternA[IndexA][1], PatternB[IndexA][1]) - Min(PatternA[IndexA][1],

PatternB[IndexA][1]) > 0 then Dec(Result,

((100 div LengthStr) div (Max(PatternA[IndexA][1], PatternB[IndexA][1]) -

Min(PatternA[IndexA][1], PatternB[IndexA][1]))))

else if Result < 100 then Inc(Result);

end

else

Dec(Result, (100 div LengthStr))

end;

SetLength(PatternA, 0);

SetLength(PatternB, 0);

end;

---------------------------------------

function Max(i1,i2:Integer):Integer;

begin

if i1>=i2 then

Result:=i1

else

Result:=i2;

end;

function StrSimilar (s1, s2: string; tolerant: Boolean): Integer;

var hit: Integer; // Number of identical chars

p1, p2: Integer; // Position count

l1, l2, l: Integer; // Length of strings

diff: Integer; // unsharp factor

hstr: string; // help var for swapping strings

// Array shows if position is already tested

test: Classes.TBits;

function CompChar (ch1, ch2: Char): Boolean;

// german "umlauts" and similar charactes

begin

if tolerant then begin

ch1:= UpCase (ch1); // compare case insensitive

ch2:= UpCase (ch2);

case ch1 of

'A', 'E': Result:= ch2 in ['A', 'E'];

'B', 'P': Result:= ch2 in ['B', 'P'];

'C', 'Z': Result:= ch2 in ['C', 'Z'];

'D', 'T': Result:= ch2 in ['D', 'T'];

'F', 'V': Result:= ch2 in ['F', 'V'];

'G', 'K': Result:= ch2 in ['G', 'K'];

'S': Result:= ch2 in ['S'];

'I', 'J',

'Y': Result:= ch2 in ['I', 'J', 'Y'];

else Result:= ch1 = ch2;

end;

end else begin

Result:= ch1 = ch2;

end;

begin

l1:= Length (s1);

l2:= Length (s2);

if (l1 <= 0) or (l2 <= 0) then begin Result:= 0; Exit; end;

// Test Length and swap, if s1 is smaller

if l1 < l2 then begin

hstr:= s2; s2:= s1; s1:= hstr;

l:= l2; l2:= l1; l1:= l;

end;

p1:= 1; p2:= 1; hit:= 0;

// calc the unsharp factor depending on

// the length of the strings

diff:= Max (l1, l2) div 3 + ABS (l1 - l2);

// init the test array

test:= Classes.TBits.Create;

// Calc size of TBits. Must be two bigger, because we're 0-Based

// counting from 1, and we need one more then stringlength

test.Size:= l1 + 2;

// loop through the string

repeat

// position tested?

if not test.Bits[p1] then begin

// found a matching character?

if CompChar (s1[p1], s2[p2]) and

(ABS(p1-p2) <= diff) then begin

test.Bits[p1]:= True;

Inc (hit); // increment the hit count

// next positions

Inc (p1); Inc (p2);

if p1 > l1 then p1:= 1;

end else begin

// Set test array

test.Bits[p1]:= False;

Inc (p1);

// Loop back to next test position

if p1 > l1 then begin

while (p1 > 1) and not (test[p1]) do Dec (p1);

Inc (p2)

end;

end else begin

Inc (p1);

// Loop back to next test position

if p1 > l1 then begin

repeat Dec (p1); until (p1 = 1) or test.Bits[p1];

Inc (p2);

end;

until p2 > l2;

test.Free; // Release Booleanlist

// calc procentual value

Result:= 100 * hit DIV l1;

end;

procedure TForm1.Button1Click(Sender: TObject);

begin

Caption:=IntToStr(StrSimilar(Edit1.Text,Edit2.Text,true));

end;

eg:

Edit1.Text is 福建省厦门市中山路11号

if Edit2.Text is 福建厦门中山路11号 -> 81

if Edit2.Text is 福建厦门中山路11 -> 72

if Edit2.Text is 福建厦门中山路 -> 63

if Edit2.Text is 厦门市福建省中山路11号 -> 59

原算法的作者为 Peter Hellinger 24. May 2000

---------------------------------------

'John' and 'John' = 100%

'John' and 'Jon' = 75%

'Jim' and 'James' = 40%

"Luke Skywalker" and 'Darth Vader' = 0%

function StrSimilar (s1, s2: string): Integer;

var hit: Integer; // Number of identical chars

p1, p2: Integer; // Position count

l1, l2: Integer; // Length of strings

pt: Integer; // for counter

diff: Integer; // unsharp factor

hstr: string; // help var for swapping strings

// Array shows is position is already tested

test: array [1..255] of Boolean;

begin

// Test Length and swap, if s1 is smaller

// we alway search along the longer string

if Length(s1) < Length(s2) then begin

hstr:= s2; s2:= s1; s1:= hstr;

end;

// store length of strings to speed up the function

l1:= Length (s1);

l2:= Length (s2);

p1:= 1; p2:= 1; hit:= 0;

// calc the unsharp factor depending on the length

// of the strings. Its about a third of the length

diff:= Max (l1, l2) div 3 + ABS (l1 - l2);

// init the test array

for pt:= 1 to l1 do test[pt]:= False;

// loop through the string

repeat

// position tested?

if not test[p1] then begin

// found a matching character?

if (s1[p1] = s2[p2]) and (ABS(p1-p2) <= diff) then begin

test[p1]:= True;

Inc (hit); // increment the hit count

// next positions

Inc (p1); Inc (p2);

if p1 > l1 then p1:= 1;

end else begin

// Set test array

test[p1]:= False;

Inc (p1);

// Loop back to next test position if end of the string

if p1 > l1 then begin

while (p1 > 1) and not (test[p1]) do Dec (p1);

Inc (p2)

end;

end else begin

Inc (p1);

// Loop back to next test position if end of string

if p1 > l1 then begin

repeat Dec (p1); until (p1 = 1) or test[p1];

Inc (p2);

end;

until p2 > Length(s2);

// calc procentual value

Result:= 100 * hit DIV l1;

end;