From 96c96aa167e5fc676146bea12d443a7d85e86254 Mon Sep 17 00:00:00 2001 From: Nick Rimmer Date: Wed, 14 Oct 2015 23:59:38 +0700 Subject: [PATCH] + Levenshtein distance +Tanimoto coefficient --- StringCompare.sln | 28 ++++++ StringCompare.sln.DotSettings | 7 ++ .../Levenshtein/LevenshteinAlgorithm.cs | 50 ++++++++++ .../LevenshteinAlgorithmExtension.cs | 16 +++ .../Algorithms/Tanimoto/TanimotoAlgorithm.cs | 27 ++++++ .../Tanimoto/TanimotoAlgorithmExtension.cs | 16 +++ StringCompare/LICENSE.txt | 21 ++++ StringCompare/Properties/AssemblyInfo.cs | 36 +++++++ StringCompare/StringCompare.csproj | 61 ++++++++++++ .../Interfaces/ICompareAlgorithm.cs | 12 +++ StringCompareTests/LICENSE.txt | 21 ++++ StringCompareTests/MainTests.cs | 92 ++++++++++++++++++ StringCompareTests/Models/DataModel.cs | 38 ++++++++ StringCompareTests/Models/ResultModel.cs | 19 ++++ StringCompareTests/Properties/AssemblyInfo.cs | 36 +++++++ StringCompareTests/StringCompareTests.csproj | 97 +++++++++++++++++++ 16 files changed, 577 insertions(+) create mode 100644 StringCompare.sln create mode 100644 StringCompare.sln.DotSettings create mode 100644 StringCompare/Algorithms/Levenshtein/LevenshteinAlgorithm.cs create mode 100644 StringCompare/Algorithms/Levenshtein/LevenshteinAlgorithmExtension.cs create mode 100644 StringCompare/Algorithms/Tanimoto/TanimotoAlgorithm.cs create mode 100644 StringCompare/Algorithms/Tanimoto/TanimotoAlgorithmExtension.cs create mode 100644 StringCompare/LICENSE.txt create mode 100644 StringCompare/Properties/AssemblyInfo.cs create mode 100644 StringCompare/StringCompare.csproj create mode 100644 StringCompare/Structures/Interfaces/ICompareAlgorithm.cs create mode 100644 StringCompareTests/LICENSE.txt create mode 100644 StringCompareTests/MainTests.cs create mode 100644 StringCompareTests/Models/DataModel.cs create mode 100644 StringCompareTests/Models/ResultModel.cs create mode 100644 StringCompareTests/Properties/AssemblyInfo.cs create mode 100644 
StringCompareTests/StringCompareTests.csproj diff --git a/StringCompare.sln b/StringCompare.sln new file mode 100644 index 0000000..be04d79 --- /dev/null +++ b/StringCompare.sln @@ -0,0 +1,28 @@ + +Microsoft Visual Studio Solution File, Format Version 12.00 +# Visual Studio 14 +VisualStudioVersion = 14.0.23107.0 +MinimumVisualStudioVersion = 10.0.40219.1 +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "StringCompare", "StringCompare\StringCompare.csproj", "{0DBA6B3E-A274-4EF6-8818-6F298240FEF5}" +EndProject +Project("{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC}") = "StringCompareTests", "StringCompareTests\StringCompareTests.csproj", "{1DE80C20-3175-4FC8-AEBF-FB148968CE58}" +EndProject +Global + GlobalSection(SolutionConfigurationPlatforms) = preSolution + Debug|Any CPU = Debug|Any CPU + Release|Any CPU = Release|Any CPU + EndGlobalSection + GlobalSection(ProjectConfigurationPlatforms) = postSolution + {0DBA6B3E-A274-4EF6-8818-6F298240FEF5}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {0DBA6B3E-A274-4EF6-8818-6F298240FEF5}.Debug|Any CPU.Build.0 = Debug|Any CPU + {0DBA6B3E-A274-4EF6-8818-6F298240FEF5}.Release|Any CPU.ActiveCfg = Release|Any CPU + {0DBA6B3E-A274-4EF6-8818-6F298240FEF5}.Release|Any CPU.Build.0 = Release|Any CPU + {1DE80C20-3175-4FC8-AEBF-FB148968CE58}.Debug|Any CPU.ActiveCfg = Debug|Any CPU + {1DE80C20-3175-4FC8-AEBF-FB148968CE58}.Debug|Any CPU.Build.0 = Debug|Any CPU + {1DE80C20-3175-4FC8-AEBF-FB148968CE58}.Release|Any CPU.ActiveCfg = Release|Any CPU + {1DE80C20-3175-4FC8-AEBF-FB148968CE58}.Release|Any CPU.Build.0 = Release|Any CPU + EndGlobalSection + GlobalSection(SolutionProperties) = preSolution + HideSolutionNode = FALSE + EndGlobalSection +EndGlobal diff --git a/StringCompare.sln.DotSettings b/StringCompare.sln.DotSettings new file mode 100644 index 0000000..8aa6183 --- /dev/null +++ b/StringCompare.sln.DotSettings @@ -0,0 +1,7 @@ + + <?xml version="1.0" encoding="utf-16"?><Profile name="add 
headers"><CSUpdateFileHeader>True</CSUpdateFileHeader></Profile>
+ Library for compare strings
+Copyright (C) $CURRENT_YEAR$ Nick Rimmer. Contacts: <xan@dipteam.com>
+
+This file is part of StringCompare library.
+Licensed under the MIT License (MIT)
\ No newline at end of file
diff --git a/StringCompare/Algorithms/Levenshtein/LevenshteinAlgorithm.cs b/StringCompare/Algorithms/Levenshtein/LevenshteinAlgorithm.cs
new file mode 100644
index 0000000..6b99220
--- /dev/null
+++ b/StringCompare/Algorithms/Levenshtein/LevenshteinAlgorithm.cs
@@ -0,0 +1,83 @@
+// Library for compare strings
+// Copyright (C) 2015 Nick Rimmer. Contacts:
+//
+// This file is part of StringCompare library.
+// Licensed under the MIT License (MIT)
+
+using System;
+using StringCompare.Structures.Interfaces;
+
+namespace StringCompare.Algorithms.Levenshtein
+{
+    /// <summary>
+    /// Scores how alike two strings are from their Levenshtein (edit) distance.
+    /// </summary>
+    public class LevenshteinAlgorithm : ICompareAlgorithm
+    {
+        /// <summary>
+        /// Compares two strings case-insensitively, ignoring leading and trailing white space.
+        /// </summary>
+        /// <param name="source">First string; <c>null</c> is treated as an empty string.</param>
+        /// <param name="target">Second string; <c>null</c> is treated as an empty string.</param>
+        /// <returns>
+        /// <c>1 - distance / longerLength</c>: 1.0 when the normalized strings are equal (two
+        /// empty strings included), 0.0 when every position has to be edited.
+        /// </returns>
+        public double GetCompareResult(string source, string target)
+        {
+            // Normalize first and measure afterwards: loops sized from the untrimmed lengths
+            // would index past the end of the trimmed strings.
+            source = (source ?? string.Empty).Trim().ToLowerInvariant();
+            target = (target ?? string.Empty).Trim().ToLowerInvariant();
+
+            var longest = Math.Max(source.Length, target.Length);
+            if (longest == 0)
+            {
+                // Two empty strings are identical; returning here also avoids 0 / 0 = NaN.
+                return 1.0;
+            }
+
+            return 1.0 - (double)GetDistance(source, target) / longest;
+        }
+
+        /// <summary>
+        /// Counts the single-character insertions, deletions and substitutions needed to turn
+        /// <paramref name="source"/> into <paramref name="target"/>.
+        /// </summary>
+        private static int GetDistance(string source, string target)
+        {
+            // Only the previous and the current row of the distance matrix are ever read.
+            var previous = new int[target.Length + 1];
+            var current = new int[target.Length + 1];
+
+            // Row 0: an empty source prefix becomes the first j target characters by j insertions.
+            for (var j = 0; j <= target.Length; j++)
+            {
+                previous[j] = j;
+            }
+
+            for (var i = 1; i <= source.Length; i++)
+            {
+                // Column 0: the first i source characters become an empty string by i deletions.
+                current[0] = i;
+
+                for (var j = 1; j <= target.Length; j++)
+                {
+                    var cost = source[i - 1] == target[j - 1] ? 0 : 1;
+                    var deletion = previous[j] + 1;
+                    var insertion = current[j - 1] + 1;
+                    var substitution = previous[j - 1] + cost;
+                    current[j] = Math.Min(Math.Min(deletion, insertion), substitution);
+                }
+
+                // Swap the rows so the one just filled is read as "previous" next time.
+                var filled = current;
+                current = previous;
+                previous = filled;
+            }
+
+            // After the last swap the final row is in "previous".
+            return previous[target.Length];
+        }
+    }
+}
\ No newline at end of file
diff --git a/StringCompare/Algorithms/Levenshtein/LevenshteinAlgorithmExtension.cs b/StringCompare/Algorithms/Levenshtein/LevenshteinAlgorithmExtension.cs
new file mode 100644
index 0000000..e3b088f
--- /dev/null
+++ b/StringCompare/Algorithms/Levenshtein/LevenshteinAlgorithmExtension.cs
@@ -0,0 +1,21 @@
+// Library for compare strings
+// Copyright (C) 2015 Nick Rimmer. Contacts:
+//
+// This file is part of StringCompare library.
+// Licensed under the MIT License (MIT)
+
+namespace StringCompare.Algorithms.Levenshtein
+{
+    /// <summary>String extension methods for <see cref="LevenshteinAlgorithm"/>.</summary>
+    public static class LevenshteinAlgorithmExtension
+    {
+        /// <summary>
+        /// Compares <paramref name="source"/> with <paramref name="target"/> through a new
+        /// <see cref="LevenshteinAlgorithm"/> and returns its result unchanged.
+        /// </summary>
+        public static double CompareLevenshtein(this string source, string target)
+        {
+            return new LevenshteinAlgorithm().GetCompareResult(source, target);
+        }
+    }
+}
\ No newline at end of file
diff --git a/StringCompare/Algorithms/Tanimoto/TanimotoAlgorithm.cs b/StringCompare/Algorithms/Tanimoto/TanimotoAlgorithm.cs
new file mode 100644
index 0000000..71e878d
--- /dev/null
+++ b/StringCompare/Algorithms/Tanimoto/TanimotoAlgorithm.cs
@@ -0,0 +1,66 @@
+// Library for compare strings
+// Copyright (C) 2015 Nick Rimmer. Contacts:
+//
+// This file is part of StringCompare library.
+// Licensed under the MIT License (MIT)
+
+using System.Collections.Generic;
+using StringCompare.Structures.Interfaces;
+
+namespace StringCompare.Algorithms.Tanimoto
+{
+    /// <summary>
+    /// Tanimoto (Jaccard) coefficient of two strings, each taken as a multiset of characters.
+    /// </summary>
+    public class TanimotoAlgorithm : ICompareAlgorithm
+    {
+        /// <summary>
+        /// Compares two strings case-insensitively, ignoring leading and trailing white space and
+        /// the order of the characters.
+        /// </summary>
+        /// <param name="source">First string; <c>null</c> is treated as an empty string.</param>
+        /// <param name="target">Second string; <c>null</c> is treated as an empty string.</param>
+        /// <returns>
+        /// <c>common / (sourceLength + targetLength - common)</c>, always within [0, 1]; 1.0 when
+        /// both strings contain exactly the same characters (two empty strings included).
+        /// </returns>
+        public double GetCompareResult(string source, string target)
+        {
+            // Normalize once and measure afterwards so the lengths in the denominator describe the
+            // same text that is being counted.
+            source = (source ?? string.Empty).Trim().ToLowerInvariant();
+            target = (target ?? string.Empty).Trim().ToLowerInvariant();
+
+            if (source.Length == 0 && target.Length == 0)
+            {
+                // Two empty strings are identical; returning here also avoids 0 / 0 = NaN.
+                return 1.0;
+            }
+
+            // Occurrences of each character in target that have not been matched yet.
+            var unmatched = new Dictionary<char, int>();
+            foreach (var symbol in target)
+            {
+                int seen;
+                unmatched.TryGetValue(symbol, out seen);
+                unmatched[symbol] = seen + 1;
+            }
+
+            // Every target occurrence may be matched once only. Counting each source character that
+            // merely appears somewhere in target let the count exceed the target length, which
+            // pushed the coefficient above 1 and made it depend on the argument order.
+            var commonsCount = 0;
+            foreach (var symbol in source)
+            {
+                int left;
+                if (unmatched.TryGetValue(symbol, out left) && left > 0)
+                {
+                    unmatched[symbol] = left - 1;
+                    commonsCount++;
+                }
+            }
+
+            return (double)commonsCount / (source.Length + target.Length - commonsCount);
+        }
+    }
+}
\ No newline at end of file
diff --git a/StringCompare/Algorithms/Tanimoto/TanimotoAlgorithmExtension.cs b/StringCompare/Algorithms/Tanimoto/TanimotoAlgorithmExtension.cs
new file mode 100644
index 0000000..0390960
--- /dev/null
+++ b/StringCompare/Algorithms/Tanimoto/TanimotoAlgorithmExtension.cs
@@ -0,0 +1,16 @@
+// Library for compare strings
+// Copyright (C) 2015 Nick Rimmer. Contacts:
+//
+// This file is part of StringCompare library.
+// Licensed under the MIT License (MIT)
+
+namespace StringCompare.Algorithms.Tanimoto
+{
+    /// <summary>Makes the Tanimoto comparison callable directly on a string.</summary>
+    public static class TanimotoAlgorithmExtension
+    {
+        /// <summary>Returns what a new <see cref="TanimotoAlgorithm"/> reports for the two strings.</summary>
+        public static double CompareTanimoto(this string source, string target) =>
+            new TanimotoAlgorithm().GetCompareResult(source, target);
+    }
+}
\ No newline at end of file
diff --git a/StringCompare/LICENSE.txt b/StringCompare/LICENSE.txt
new file mode 100644
index 0000000..f17c8e4
--- /dev/null
+++ b/StringCompare/LICENSE.txt
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2015 Nick Rimmer
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file diff --git a/StringCompare/Properties/AssemblyInfo.cs b/StringCompare/Properties/AssemblyInfo.cs new file mode 100644 index 0000000..0419be5 --- /dev/null +++ b/StringCompare/Properties/AssemblyInfo.cs @@ -0,0 +1,36 @@ +using System.Reflection; +using System.Runtime.CompilerServices; +using System.Runtime.InteropServices; + +// General Information about an assembly is controlled through the following +// set of attributes. Change these attribute values to modify the information +// associated with an assembly. +[assembly: AssemblyTitle("StringCompare")] +[assembly: AssemblyDescription("")] +[assembly: AssemblyConfiguration("")] +[assembly: AssemblyCompany("Nick Rimmer")] +[assembly: AssemblyProduct("StringCompare")] +[assembly: AssemblyCopyright("Copyright (C) 2015 Nick Rimmer. Contacts: ")] +[assembly: AssemblyTrademark("")] +[assembly: AssemblyCulture("")] + +// Setting ComVisible to false makes the types in this assembly not visible +// to COM components. If you need to access a type in this assembly from +// COM, set the ComVisible attribute to true on that type. 
+[assembly: ComVisible(false)] + +// The following GUID is for the ID of the typelib if this project is exposed to COM +[assembly: Guid("0dba6b3e-a274-4ef6-8818-6f298240fef5")] + +// Version information for an assembly consists of the following four values: +// +// Major Version +// Minor Version +// Build Number +// Revision +// +// You can specify all the values or you can default the Build and Revision Numbers +// by using the '*' as shown below: +// [assembly: AssemblyVersion("1.0.*")] +[assembly: AssemblyVersion("1.0.0.0")] +[assembly: AssemblyFileVersion("1.0.0.0")] diff --git a/StringCompare/StringCompare.csproj b/StringCompare/StringCompare.csproj new file mode 100644 index 0000000..ec8f16f --- /dev/null +++ b/StringCompare/StringCompare.csproj @@ -0,0 +1,61 @@ + + + + + Debug + AnyCPU + {0DBA6B3E-A274-4EF6-8818-6F298240FEF5} + Library + Properties + StringCompare + StringCompare + v4.0 + 512 + + + true + full + false + bin\Debug\ + DEBUG;TRACE + prompt + 4 + + + none + true + bin\Release\ + TRACE + prompt + 4 + bin\Release\StringCompare.XML + + + + + + + + + + + + + + + + + + + + + + + + \ No newline at end of file diff --git a/StringCompare/Structures/Interfaces/ICompareAlgorithm.cs b/StringCompare/Structures/Interfaces/ICompareAlgorithm.cs new file mode 100644 index 0000000..b6ff41e --- /dev/null +++ b/StringCompare/Structures/Interfaces/ICompareAlgorithm.cs @@ -0,0 +1,12 @@ +// Library for compare strings +// Copyright (C) 2015 Nick Rimmer. Contacts: +// +// This file is part of StringCompare library. 
+// Licensed under the MIT License (MIT)
+namespace StringCompare.Structures.Interfaces
+{
+    public interface ICompareAlgorithm // contract shared by the comparison algorithms; implemented in this patch by LevenshteinAlgorithm and TanimotoAlgorithm
+    {
+        double GetCompareResult(string source, string target); // compares source with target and returns a numeric score; its range and meaning are defined by the implementation
+    }
+}
\ No newline at end of file
diff --git a/StringCompareTests/LICENSE.txt b/StringCompareTests/LICENSE.txt
new file mode 100644
index 0000000..f17c8e4
--- /dev/null
+++ b/StringCompareTests/LICENSE.txt
@@ -0,0 +1,21 @@
+The MIT License (MIT)
+
+Copyright (c) 2015 Nick Rimmer
+
+Permission is hereby granted, free of charge, to any person obtaining a copy
+of this software and associated documentation files (the "Software"), to deal
+in the Software without restriction, including without limitation the rights
+to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
+copies of the Software, and to permit persons to whom the Software is
+furnished to do so, subject to the following conditions:
+
+The above copyright notice and this permission notice shall be included in all
+copies or substantial portions of the Software.
+
+THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
+IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
+FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
+AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
+LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
+OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN THE
+SOFTWARE.
\ No newline at end of file
diff --git a/StringCompareTests/MainTests.cs b/StringCompareTests/MainTests.cs
new file mode 100644
index 0000000..dfd8f38
--- /dev/null
+++ b/StringCompareTests/MainTests.cs
@@ -0,0 +1,92 @@
+// Library for compare strings
+// Copyright (C) 2015 Nick Rimmer. Contacts:
+//
+// This file is part of StringCompare library.
+// Licensed under the MIT License (MIT)
+
+using System;
+using System.Collections.Generic;
+using System.Linq;
+using Microsoft.VisualStudio.TestTools.UnitTesting;
+using StringCompare.Algorithms.Levenshtein;
+using StringCompare.Algorithms.Tanimoto;
+using StringCompare.Structures.Interfaces;
+using StringCompareTests.Models;
+
+namespace StringCompareTests
+{
+    [TestClass]
+    public class MainTests
+    {
+        /// <summary>
+        /// Data sets for the tests: a source string plus the targets it is compared with.
+        /// </summary>
+        private readonly List<DataModel> _datas = new List<DataModel>
+        {
+            new DataModel("Проверка опечатков", "Этот белый шарик")
+                .AddTarget("Это белий шарек") // 3 typos
+                .AddTarget("Этот белий шарек") // 2 typos
+                .AddTarget("Этот белий шарик") // 1 typo
+                .AddTarget("Этот белый шарик") // identical
+                .AddTarget("левый текст") // unrelated text
+            ,
+            new DataModel("Проверка лишних слов", "крутой спуск")
+                .AddTarget("слишком крутой спуск")
+                .AddTarget("не слишком крутой спуск")
+                .AddTarget("крутой спуск спереди")
+                .AddTarget("левый текст")
+                .AddTarget("крутой спуск")
+            ,
+            new DataModel("Проверка с повтором", "солнцестояние")
+                .AddTarget("солнцестояние солнцестояние")
+                .AddTarget("солнцестояние солнцестояние солнцестояние")
+                .AddTarget("левый текст")
+                .AddTarget("солнцестояние")
+            ,
+            new DataModel("Проверка с обратным повтором", "солнцестояние солнцестояние")
+                .AddTarget("солнцестояние солнцестояние")
+                .AddTarget("солнцестояние солнцестояние солнцестояние")
+                .AddTarget("левый текст")
+                .AddTarget("солнцестояние")
+            ,
+            new DataModel("Проверка в разных местах", "слово")
+                .AddTarget("первым было слово")
+                .AddTarget("первым слово было")
+                .AddTarget("слово первым было")
+                .AddTarget("слово")
+        };
+
+        private readonly List<ICompareAlgorithm> _algorithms = new List<ICompareAlgorithm>
+        {
+            new TanimotoAlgorithm(),
+            new LevenshteinAlgorithm()
+        };
+
+        /// <summary>Prints each algorithm's ranking and checks that an exact match scores 1.0.</summary>
+        [TestMethod]
+        public void Main_Test1()
+        {
+            foreach (var algorithm in _algorithms)
+            {
+                Console.WriteLine("\n\nАлгоритм - {0}", algorithm.GetType().Name);
+                foreach (var data in _datas)
+                {
+                    Console.WriteLine("\nТест \"{0}\" ({1}):", data.DataName, data.Source);
+                    var results = data.Targets
+                        .Select(x => new ResultModel
+                        {
+                            Target = x,
+                            CompareValue = algorithm.GetCompareResult(data.Source, x)
+                        })
+                        .OrderByDescending(x => x.CompareValue)
+                        .ToList(); // materialized once: printed and asserted on below
+
+                    foreach (var result in results)
+                        Console.WriteLine(result);
+                    var exact = results.First(x => x.Target == data.Source);
+                    Assert.AreEqual(1.0, exact.CompareValue, 1e-9, "an exact match must score 1.0");
+                }
+            }
+        }
+    }
+}
\ No newline at end of file
diff --git a/StringCompareTests/Models/DataModel.cs b/StringCompareTests/Models/DataModel.cs
new file mode 100644
index 0000000..e36cef1
--- /dev/null
+++ b/StringCompareTests/Models/DataModel.cs
@@ -0,0 +1,47 @@
+// Library for compare strings
+// Copyright (C) 2015 Nick Rimmer. Contacts:
+//
+// This file is part of StringCompare library.
+// Licensed under the MIT License (MIT)
+
+using System.Collections.Generic;
+
+namespace StringCompareTests.Models
+{
+    /// <summary>
+    /// A named test case: one source string and the targets it is compared with.
+    /// </summary>
+    public class DataModel
+    {
+        /// <summary>String every target is compared against.</summary>
+        public string Source { get; set; }
+
+        /// <summary>Candidate strings, in the order they were added.</summary>
+        public List<string> Targets { get; set; }
+
+        /// <summary>Display name of the test case.</summary>
+        public string DataName { get; set; }
+
+        /// <summary>Creates a test case with an empty target list.</summary>
+        public DataModel(string dataName, string source)
+        {
+            DataName = dataName;
+            Source = source;
+            Targets = new List<string>();
+        }
+
+        /// <summary>Replaces the source string and returns this instance for chaining.</summary>
+        public DataModel SetSource(string source)
+        {
+            Source = source;
+            return this;
+        }
+
+        /// <summary>Appends a target and returns this instance for chaining.</summary>
+        public DataModel AddTarget(string target)
+        {
+            Targets.Add(target);
+            return this;
+        }
+    }
+}
\ No newline at end of file
diff --git a/StringCompareTests/Models/ResultModel.cs b/StringCompareTests/Models/ResultModel.cs
new file mode 100644
index 0000000..927cd13
--- /dev/null
+++ b/StringCompareTests/Models/ResultModel.cs
@@ -0,0 +1,19 @@
+// Library for compare strings
+// Copyright (C) 2015 Nick Rimmer. Contacts:
+//
+// This file is part of StringCompare library.
+// Licensed under the MIT License (MIT)
+namespace StringCompareTests.Models
+{
+    /// <summary>Score an algorithm gave to one target string.</summary>
+    public class ResultModel
+    {
+        /// <summary>Value returned by the comparison.</summary>
+        public double CompareValue { get; set; }
+        /// <summary>String that was compared with the source.</summary>
+        public string Target { get; set; }
+
+        /// <summary>Formats the result as the value, a dash, then the quoted target.</summary>
+        public override string ToString() => string.Format("{0} - \"{1}\"", CompareValue, Target);
+    }
+}
\ No newline at end of file
diff --git a/StringCompareTests/Properties/AssemblyInfo.cs b/StringCompareTests/Properties/AssemblyInfo.cs
new file mode 100644
index 0000000..00f8bee
--- /dev/null
+++ b/StringCompareTests/Properties/AssemblyInfo.cs
@@ -0,0 +1,36 @@
+using System.Reflection;
+using System.Runtime.CompilerServices;
+using System.Runtime.InteropServices;
+
+// General Information about an assembly is controlled through the following
+// set of attributes. Change these attribute values to modify the information
+// associated with an assembly.
+[assembly: AssemblyTitle("StringCompareTests")]
+[assembly: AssemblyDescription("")]
+[assembly: AssemblyConfiguration("")]
+[assembly: AssemblyCompany("Nick Rimmer")]
+[assembly: AssemblyProduct("StringCompareTests")]
+[assembly: AssemblyCopyright("Copyright (C) 2015 Nick Rimmer. Contacts: ")]
+[assembly: AssemblyTrademark("")]
+[assembly: AssemblyCulture("")]
+
+// Setting ComVisible to false makes the types in this assembly not visible
+// to COM components. If you need to access a type in this assembly from
+// COM, set the ComVisible attribute to true on that type.
+[assembly: ComVisible(false)] + +// The following GUID is for the ID of the typelib if this project is exposed to COM +[assembly: Guid("1de80c20-3175-4fc8-aebf-fb148968ce58")] + +// Version information for an assembly consists of the following four values: +// +// Major Version +// Minor Version +// Build Number +// Revision +// +// You can specify all the values or you can default the Build and Revision Numbers +// by using the '*' as shown below: +// [assembly: AssemblyVersion("1.0.*")] +[assembly: AssemblyVersion("1.0.0.0")] +[assembly: AssemblyFileVersion("1.0.0.0")] diff --git a/StringCompareTests/StringCompareTests.csproj b/StringCompareTests/StringCompareTests.csproj new file mode 100644 index 0000000..faf9b4b --- /dev/null +++ b/StringCompareTests/StringCompareTests.csproj @@ -0,0 +1,97 @@ + + + + Debug + AnyCPU + {1DE80C20-3175-4FC8-AEBF-FB148968CE58} + Library + Properties + StringCompareTests + StringCompareTests + v4.0 + 512 + {3AC096D0-A1C2-E12C-1390-A8335801FDAB};{FAE04EC0-301F-11D3-BF4B-00C04F79EFBC} + 10.0 + $(MSBuildExtensionsPath32)\Microsoft\VisualStudio\v$(VisualStudioVersion) + $(ProgramFiles)\Common Files\microsoft shared\VSTT\$(VisualStudioVersion)\UITestExtensionPackages + False + UnitTest + + + true + full + false + bin\Debug\ + DEBUG;TRACE + prompt + 4 + + + pdbonly + true + bin\Release\ + TRACE + prompt + 4 + + + + + 3.5 + + + + + + + + + + + + + + + + + + + + + + + + + + {0dba6b3e-a274-4ef6-8818-6f298240fef5} + StringCompare + + + + + + + False + + + False + + + False + + + False + + + + + + + + \ No newline at end of file