Tuesday, March 13, 2018

Delete duplicates in a List of int arrays

Leave a Comment

having a List of int arrays like:

List<int[]> intArrList = new List<int[]>(); intArrList.Add(new int[3] { 0, 0, 0 }); intArrList.Add(new int[5] { 20, 30, 10, 4, 6 });  //this intArrList.Add(new int[3] { 1, 2, 5 }); intArrList.Add(new int[5] { 20, 30, 10, 4, 6 });  //this intArrList.Add(new int[3] { 12, 22, 54 }); intArrList.Add(new int[5] { 1, 2, 6, 7, 8 }); intArrList.Add(new int[4] { 0, 0, 0, 0 }); 

How would you remove duplicates (by duplicate I mean element of list has same length and same numbers).

On the example I would remove element { 20, 30, 10, 4, 6 } because it is found twice

I was thinking on sorting the list by element size, then loop each element against rest but I am not sure how to do that.

Other question would be, if using other structure like a Hash would be better... If so how to use it?

8 Answers

Answers 1

Use GroupBy:

var result = intArrList.GroupBy(c => String.Join(",", c))                        .Select(c => c.First().ToList()).ToList(); 

The result:

{0, 0, 0}

{20, 30, 10, 4, 6}

{1, 2, 5}

{12, 22, 54}

{1, 2, 6, 7, 8}

{0, 0, 0, 0}

EDIT: If you want to consider {1,2,3,4} be equal to {2,3,4,1} you need to use OrderBy like this:

var result = intArrList.GroupBy(p => string.Join(", ", p.OrderBy(c => c)))                        .Select(c => c.First().ToList()).ToList();  

EDIT2: To help understanding how the LINQ GroupBy solution works consider the following method:

public List<int[]> FindDistinctWithoutLinq(List<int[]> lst) {     var dic = new Dictionary<string, int[]>();     foreach (var item in lst)     {         string key = string.Join(",", item.OrderBy(c=>c));          if (!dic.ContainsKey(key))         {             dic.Add(key, item);         }     }      return dic.Values.ToList(); } 

Answers 2

You can define your own implementation of IEqualityComparer and use it together with IEnumerable.Distinct:

class MyComparer : IEqualityComparer<int[]>  {     public int GetHashCode(int[] instance) { return 0; } // TODO: better HashCode for arrays     public bool Equals(int[] instance, int[] other)     {         if (other == null || instance == null || instance.Length != other.Length) return false;          return instance.SequenceEqual(other);     } } 

Now write this to get only distinct values for your list:

var result = intArrList.Distinct(new MyComparer()); 

However if you want different permutations also you should implement your comparer this way:

public bool Equals(int[] instance, int[] other) {     if (ReferenceEquals(instance, other)) return true; // this will return true when both arrays are NULL     if (other == null || instance == null) return false;     return instance.All(x => other.Contains(x)) && other.All(x => instance.Contains(x)); } 

EDIT: For a better GetashCode-implementation you may have a look at this post as also suggested in @Mick´s answer.

Answers 3

Well lifting code from here and here. A more generic implementation of GetHashCode would make this more generic, however I believe the implementation below is the most robust

class Program {     static void Main(string[] args)     {         List<int[]> intArrList = new List<int[]>();         intArrList.Add(new int[3] { 0, 0, 0 });         intArrList.Add(new int[5] { 20, 30, 10, 4, 6 });  //this         intArrList.Add(new int[3] { 1, 2, 5 });         intArrList.Add(new int[5] { 20, 30, 10, 4, 6 });  //this         intArrList.Add(new int[3] { 12, 22, 54 });         intArrList.Add(new int[5] { 1, 2, 6, 7, 8 });         intArrList.Add(new int[4] { 0, 0, 0, 0 });          var test = intArrList.Distinct(new IntArrayEqualityComparer());         Console.WriteLine(test.Count());         Console.WriteLine(intArrList.Count());     }      public class IntArrayEqualityComparer : IEqualityComparer<int[]>     {         public bool Equals(int[] x, int[] y)         {             return ArraysEqual(x, y);         }          public int GetHashCode(int[] obj)         {             int hc = obj.Length;             for (int i = 0; i < obj.Length; ++i)             {                 hc = unchecked(hc * 17 + obj[i]);             }             return hc;         }          static bool ArraysEqual<T>(T[] a1, T[] a2)         {             if (ReferenceEquals(a1, a2))                 return true;              if (a1 == null || a2 == null)                 return false;              if (a1.Length != a2.Length)                 return false;              EqualityComparer<T> comparer = EqualityComparer<T>.Default;             for (int i = 0; i < a1.Length; i++)             {                 if (!comparer.Equals(a1[i], a2[i])) return false;             }             return true;         }     } } 

Edit: a Generic implementation of IEqualityComparer for an arrays of any type:-

public class ArrayEqualityComparer<T> : IEqualityComparer<T[]> {     public bool Equals(T[] x, T[] y)     {         if (ReferenceEquals(x, y))             return true;          if (x == null || y == null)             return false;          if (x.Length != y.Length)             return false;          EqualityComparer<T> comparer = EqualityComparer<T>.Default;         for (int i = 0; i < x.Length; i++)         {             if (!comparer.Equals(x[i], y[i])) return false;         }         return true;     }      public int GetHashCode(T[] obj)     {         int hc = obj.Length;         for (int i = 0; i < obj.Length; ++i)         {             hc = unchecked(hc * 17 + obj[i].GetHashCode());         }         return hc;     } } 

Edit2: If ordering of the integers within the arrays doesn't matter I would

var test = intArrList.Select(a => a.OrderBy(e => e).ToArray()).Distinct(comparer).ToList(); 

Answers 4

List<int[]> CopyString1 = new List<int[]>(); CopyString1.AddRange(intArrList); List<int[]> CopyString2 = new List<int[]>(); CopyString2.AddRange(intArrList); for (int i = 0; i < CopyString2.Count(); i++) {     for (int j = i; j < CopyString1.Count(); j++)     {         if (i != j && CopyString2[i].Count() == CopyString1[j].Count())         {             var cnt = 0;             for (int k = 0; k < CopyString2[i].Count(); k++)             {                 if (CopyString2[i][k] == CopyString1[j][k])                     cnt++;                 else                     break;             }             if (cnt == CopyString2[i].Count())                 intArrList.RemoveAt(i);         }     } } 

Answers 5

Input list;

List<List<int>> initList = new List<List<int>>(); initList.Add(new List<int>{ 0, 0, 0 }); initList.Add(new List<int>{ 20, 30, 10, 4, 6 });  //this initList.Add(new List<int> { 1, 2, 5 }); initList.Add(new List<int> { 20, 30, 10, 4, 6 });  //this initList.Add(new List<int> { 12, 22, 54 }); initList.Add(new List<int> { 1, 2, 6, 7, 8 }); initList.Add(new List<int> { 0, 0, 0, 0 }); 

You can create a result list, and before adding elements you can check if it is already added. I simply compared the list counts and used p.Except(item).Any() call to check if the list contains that element or not.

List<List<int>> returnList = new List<List<int>>();  foreach (var item in initList) {     if (returnList.Where(p => !p.Except(item).Any() && !item.Except(p).Any()                              && p.Count() == item.Count() ).Count() == 0)     returnList.Add(item); } 

Answers 6

You can use a HashSet. HashSet is a collection used for guarantee uniqueness and you can compare items on collection, Intersect, Union. etc.

Pros: No duplicates, easy to manipulate groups of data, more efficient Cons: You can't get a specific item in the collection, for example: list[0] doesn't work for HashSets. You can only Enumerating the items. e.g. foreach

Here is an example:

using System; using System.Collections.Generic;  namespace ConsoleApp2 {     class Program     {         static void Main(string[] args)         {             HashSet<HashSet<int>> intArrList = new HashSet<HashSet<int>>(new HashSetIntComparer());             intArrList.Add(new HashSet<int>(3) { 0, 0, 0 });             intArrList.Add(new HashSet<int>(5) { 20, 30, 10, 4, 6 });  //this             intArrList.Add(new HashSet<int>(3) { 1, 2, 5 });             intArrList.Add(new HashSet<int>(5) { 20, 30, 10, 4, 6 });  //this             intArrList.Add(new HashSet<int>(3) { 12, 22, 54 });             intArrList.Add(new HashSet<int>(5) { 1, 2, 6, 7, 8 });             intArrList.Add(new HashSet<int>(4) { 0, 0, 0, 0 });              // Checking the output             foreach (var item in intArrList)             {                 foreach (var subHasSet in item)                 {                     Console.Write("{0} ", subHasSet);                 }                  Console.WriteLine();             }                          Console.Read();         }          private class HashSetIntComparer : IEqualityComparer<HashSet<int>>         {             public bool Equals(HashSet<int> x, HashSet<int> y)             {                 // SetEquals does't set anything. It's a method for compare the contents of the HashSet.                  // Such a poor name from .Net                 return x.SetEquals(y);             }              public int GetHashCode(HashSet<int> obj)             {                 //TODO: implemente a better HashCode                 return base.GetHashCode();             }         }     } }   Output: 0 20 30 10 4 6 1 2 5 12 22 54 1 2 6 7 8 

Note: Since 0 is repeated several times, HashSet considers the 0 only once. If you need diferentiate between 0 0 0 0 and 0 0 0 then you can replace HashSet<HashSet<int>> for HashSet<List<int>> and implement a Comparer to the List instead.

You can use this link to learn how to compare a list: https://social.msdn.microsoft.com/Forums/en-US/2ff3016c-bd61-4fec-8f8c-7b6c070123fa/c-compare-two-lists-of-objects?forum=csharplanguage

If you want to learn more about Collections and DataTypes this course is a perfect place to learn it: https://app.pluralsight.com/player?course=csharp-collections&author=simon-robinson&name=csharp-collections-fundamentals-m9-sets&clip=1&mode=live

Answers 7

Using MoreLINQ this can be very simple with DistinctBy.

var result = intArrList.DistinctBy(x => string.Join(",", x)); 

Similar to the GroupBy answer if you want distinction to be irrespective of order just order in the join.

var result = intArrList.DistinctBy(x => string.Join(",", x.OrderBy(y => y))); 

EDIT: This is how it's implemented

public static IEnumerable<TSource> DistinctBy<TSource, TKey>(this IEnumerable<TSource> source,             Func<TSource, TKey> keySelector, IEqualityComparer<TKey> comparer)         {             if (source == null) throw new ArgumentNullException(nameof(source));             if (keySelector == null) throw new ArgumentNullException(nameof(keySelector));              return _(); IEnumerable<TSource> _()             {                 var knownKeys = new HashSet<TKey>(comparer);                 foreach (var element in source)                 {                     if (knownKeys.Add(keySelector(element)))                         yield return element;                 }             }         } 

So you if you don't need MoreLINQ for anything else you can just use a method like this:

private static IEnumerable<int[]> GetUniqueArrays(IEnumerable<int[]> source)     {         var knownKeys = new HashSet<string>();         foreach (var element in source)         {             if (knownKeys.Add(string.Join(",", element)))                 yield return element;         }     } 

Answers 8

Perf comparison of @S.Akbari's and @Mick's solutions using BenchmarkDotNet

EDIT:

SAkbari_FindDistinctWithoutLinq has redundant call to ContainsKey, so i added impoved and faster version: SAkbari_FindDistinctWithoutLinq2

                            Method |     Mean |     Error |    StdDev | --------------------------------- |---------:|----------:|----------:|   SAkbari_FindDistinctWithoutLinq | 4.021 us | 0.0723 us | 0.0676 us |  SAkbari_FindDistinctWithoutLinq2 | 3.930 us | 0.0529 us | 0.0495 us |          SAkbari_FindDistinctLinq | 5.597 us | 0.0264 us | 0.0234 us |             Mick_UsingGetHashCode | 6.339 us | 0.0265 us | 0.0248 us | 
 BenchmarkDotNet=v0.10.13, OS=Windows 10 Redstone 3 [1709, Fall Creators Update] (10.0.16299.248) Intel Core i7-7700 CPU 3.60GHz (Kaby Lake), 1 CPU, 8 logical cores and 4 physical cores Frequency=3515625 Hz, Resolution=284.4444 ns, Timer=TSC .NET Core SDK=2.1.100   [Host]     : .NET Core 2.0.5 (CoreCLR 4.6.26020.03, CoreFX 4.6.26018.01), 64bit RyuJIT   DefaultJob : .NET Core 2.0.5 (CoreCLR 4.6.26020.03, CoreFX 4.6.26018.01), 64bit RyuJIT 

Benchmark:

using BenchmarkDotNet.Attributes; using BenchmarkDotNet.Running; using System; using System.Collections.Generic; using System.Linq;  namespace ConsoleApp1 {     public class Program     {         List<int[]> intArrList = new List<int[]>         {             new int[] { 0, 0, 0 },             new int[] { 20, 30, 10, 4, 6 },  //this             new int[] { 1, 2, 5 },             new int[] { 20, 30, 10, 4, 6 },  //this             new int[] { 12, 22, 54 },             new int[] { 1, 2, 6, 7, 8 },             new int[] { 0, 0, 0, 0 }         };          [Benchmark]         public List<int[]> SAkbari_FindDistinctWithoutLinq() => FindDistinctWithoutLinq(intArrList);          [Benchmark]         public List<int[]> SAkbari_FindDistinctWithoutLinq2() => FindDistinctWithoutLinq2(intArrList);          [Benchmark]         public List<int[]> SAkbari_FindDistinctLinq() => FindDistinctLinq(intArrList);          [Benchmark]         public List<int[]> Mick_UsingGetHashCode() => FindDistinctLinq(intArrList);          static void Main(string[] args)         {             var summary = BenchmarkRunner.Run<Program>();         }          public static List<int[]> FindDistinctWithoutLinq(List<int[]> lst)         {             var dic = new Dictionary<string, int[]>();             foreach (var item in lst)             {                 string key = string.Join(",", item.OrderBy(c => c));                  if (!dic.ContainsKey(key))                 {                     dic.Add(key, item);                 }             }              return dic.Values.ToList();         }          public static List<int[]> FindDistinctWithoutLinq2(List<int[]> lst)         {             var dic = new Dictionary<string, int[]>();              foreach (var item in lst)                 dic.TryAdd(string.Join(",", item.OrderBy(c => c)), item);              return dic.Values.ToList();         }          public static List<int[]> FindDistinctLinq(List<int[]> lst)         {             return lst.GroupBy(p => string.Join(", ", p.OrderBy(c => c)))                        .Select(c => c.First().ToArray()).ToList();         }          public static List<int[]> UsingGetHashCode(List<int[]> lst)         {             return lst.Select(a => a.OrderBy(e => e).ToArray()).Distinct(new IntArrayEqualityComparer()).ToList();         }     }      public class IntArrayEqualityComparer : IEqualityComparer<int[]>     {         public bool Equals(int[] x, int[] y)         {             return ArraysEqual(x, y);         }          public int GetHashCode(int[] obj)         {             int hc = obj.Length;             for (int i = 0; i < obj.Length; ++i)             {                 hc = unchecked(hc * 17 + obj[i]);             }             return hc;         }          static bool ArraysEqual<T>(T[] a1, T[] a2)         {             if (ReferenceEquals(a1, a2))                 return true;              if (a1 == null || a2 == null)                 return false;              if (a1.Length != a2.Length)                 return false;              EqualityComparer<T> comparer = EqualityComparer<T>.Default;             for (int i = 0; i < a1.Length; i++)             {                 if (!comparer.Equals(a1[i], a2[i])) return false;             }             return true;         }     } } 
If You Enjoyed This, Take 5 Seconds To Share It

0 comments:

Post a Comment