Q: How to find identical and similar string values ​​in different tables in SQL

advertisements

I have two tables in my database, each table has column with names. How to compare these tables columns and how to find these names what has exact matches and these names, which are similar in table1 and table2?

For example:

table1

column1
---------------------
Tom Hawky
Antony Hendric Formen
Cathy Cassy
Bill Gates
Mary Diore

table2

column1
----------------------
Christopher Fridricson
Ken Lovely
Tom Hawky
Anthony Foreman
Chati Cassei

Result should come out with this:

table1 - table2
Tom Hawky - Tom Hawky
Antony Hendric Formen - Anthony Henrich Foreman
Cathy Cassy - Chati Cassei


One solution is to do the following:

1) Split the names into some temporary tables, so that comparison can be done much easier

2) Construct a query that tries to find matches based on some distance

1) Splitting into words

Setup

-- drop table Table1
create table Table1
(
    Id1 INT NOT NULL CONSTRAINT PK_Table1 PRIMARY KEY IDENTITY (1, 1),
    Column1 NVARCHAR(200) NOT NULL
)
GO

insert into Table1 (Column1)
VALUES ('Tom Hawky'), ('Antony Hendric Formen'), ('Cathy Cassy'), ('Bill Gates'), ('Mary Diore')
GO

-- drop table Table2
create table Table2
(
    Id2 INT NOT NULL CONSTRAINT PK_Table2 PRIMARY KEY IDENTITY (1, 1),
    Column2 NVARCHAR(200) NOT NULL
)
GO

insert into Table2 (Column2)
VALUES ('Christopher Fridricson'), ('Ken Lovely'), ('Tom Hawky'), ('Anthony Foreman'), ('Chati Cassei'), ('Tom X')
GO

select * from Table1
GO

select * from Table2
GO

Split function

There are several split functions that can be used and I have chosen the XML way:

CREATE FUNCTION dbo.SplitStrings_XML
(
   @List       NVARCHAR(MAX),
   @Delimiter  NVARCHAR(255)
)
RETURNS TABLE
WITH SCHEMABINDING
AS
   RETURN
   (
      SELECT Item = y.i.value('(./text())[1]', 'nvarchar(4000)')
      FROM
      (
        SELECT x = CONVERT(XML, '<i>'
          + REPLACE(@List, @Delimiter, '</i><i>')
          + '</i>').query('.')
      ) AS a CROSS APPLY x.nodes('i') AS y(i)
   );
GO

Split the words

drop table #T1Words
Go

create table #T1Words (Id1 INT NOT NULL, Word NVARCHAR(100))
GO

insert into #T1Words
select T1.Id1, X1.Item
from Table1 T1
    cross apply dbo.SplitStrings_XML (T1.Column1, N' ') X1
GO

drop table #T2Words
GO 

create table #T2Words (Id2 INT NOT NULL, Word NVARCHAR(200))
GO

insert into #T2Words
select T2.Id2, X2.Item
from Table2 T2
    cross apply dbo.SplitStrings_XML (T2.Column2, N' ') X2
GO

Select using some distance

One distance that can be used is Levenshtein. In order to use it in SQL, it must be implemented in CLR, otherwise it is very slow. Something like this:

[Microsoft.SqlServer.Server.SqlFunction(IsDeterministic = true, IsPrecise = false)]
public static int Levenshtein(SqlString S1, SqlString S2)
{
    if (S1.IsNull)
        S1 = new SqlString("");

    if (S2.IsNull)
        S2 = new SqlString("");

    int maxLen = 4096;

    // keeping only the first part of the string (performance reasons)
    String SC1 = S1.Value.ToUpper();
    String SC2 = S2.Value.ToUpper();

    if (SC1.Length > maxLen)
        SC1 = SC1.Remove(maxLen);
    if (SC2.Length > maxLen)
        SC2 = SC2.Remove(maxLen);

    int n = SC1.Length;
    int m = SC2.Length;

    short[,] d = new short[n + 1, m + 1];
    int cost = 0;

    if (n + m == 0)
    {
        return 0;
    }
    else if (n == 0)
    {
        return 0;
    }
    else if (m == 0)
    {
        return 0;
    }

    for (short i = 0; i <= n; i++)
        d[i, 0] = i;

    for (short j = 0; j <= m; j++)
        d[0, j] = j;

    for (int i = 1; i <= n; i++)
    {
        for (int j = 1; j <= m; j++)
        {
            if (SC1[i - 1] == SC2[j - 1])
                cost = 0;
            else
                cost = 1;

            d[i, j] = (short) System.Math.Min(System.Math.Min(d[i - 1, j] + 1, d[i, j - 1] + 1), d[i - 1, j - 1] + cost);
        }
    }

    // double percentage = System.Math.Round((1.0 - ((double)d[n, m] / (double)System.Math.Max(n, m))) * 100.0, 2);
    // return percentage;
    return d[n, m];
}

-- SQL to actually create scalar function that calls CLR code
ALTER FUNCTION dbo.Levenshtein(@S1 nvarchar(max), @S2 nvarchar(max))
    RETURNS INT as EXTERNAL NAME ClrUtils.StoredFunctions.Levenshtein
GO

-- CLR must be enabled for database
sp_configure 'clr enabled', 1
GO
RECONFIGURE
GO

2) The query

select T1.Id1, T1.Column1, T2.Id2, T2.Column2
from Table1 T1
    cross join Table2 T2
where EXISTS (
    SELECT 1
    FROM #T1Words T1W
        JOIN #T2Words T2W ON [dbo].[Levenshtein](T2W.Word, T1W.Word) < 3   -- T2W.Word = T1W.Word
    WHERE T1W.Id1 = T1.Id1
        AND T2W.Id2 = T2.Id2
)

Of course, you must set your threshold based on the acceptable difference.