Skip to content

Commit

Permalink
Add more encodings.
Browse files Browse the repository at this point in the history
  • Loading branch information
Ivan Ivon committed Jul 23, 2021
1 parent c6bdf21 commit c8e593f
Show file tree
Hide file tree
Showing 7 changed files with 500 additions and 39 deletions.
2 changes: 1 addition & 1 deletion Deploy/buildlpx.cmd
Original file line number Diff line number Diff line change
@@ -1,6 +1,6 @@
@echo off

set version=6.15.1
set version=6.16.0
set fileName=CsvLINQPadDriver.%version%.lpx

set zip="%ProgramFiles%\7-Zip\7z.exe"
Expand Down
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -174,8 +174,8 @@ CSV files connection can be added to LINQPad 6/5 the same way as any other conne
* `c:\Books\Books?.csv`: `Books.csv`, `Books1.csv`, etc. files in folder `c:\Books`
* `c:\Books\*.csv`: all `*.csv` files in folder `c:\Books`
* `c:\Books\**.csv`: all `*.csv` files in folder `c:\Books` and its sub-folders.
* Order files by: specify files sort order. Affects similar files order.
* Fallback encoding: specify encoding to use if file encoding could not be detected, e.g. due to missing [BOM](https://en.wikipedia.org/wiki/Byte_order_mark). `UTF-8` is default.
* Order files by: sort order for files. Affects the order of similar files.
* Fallback encoding: [encoding](https://docs.microsoft.com/en-us/windows/win32/intl/code-page-identifiers) to use if file encoding could not be detected. `UTF-8` is default.
* Auto-detect file encodings: try to detect file encodings.
* Validate file paths: check if file paths are valid.
* Ignore files with invalid format: files with content which does not resemble CSV will be ignored.
Expand Down
4 changes: 2 additions & 2 deletions Src/CsvLINQPadDriver/ConnectionDialog.xaml
Original file line number Diff line number Diff line change
Expand Up @@ -327,11 +327,11 @@
<ComboBox Grid.Row="3" Grid.Column="1"
Name="NoBomEncodingComboBox"
Style="{StaticResource FilesGroupComboBoxStyle}"
ToolTip="Specify encoding to use if file encoding could not be detected, e.g. due to missing BOM"
ToolTip="Encoding to use if file encoding could not be detected"
SelectedValue="{Binding NoBomEncoding}"
ItemsSource="{Binding Source={StaticResource NoBomEncodingData}}"/>
<TextBlock Grid.Row="3" Grid.Column="2" Padding="5 5 0 0">
<Hyperlink NavigateUri="https://en.wikipedia.org/wiki/Byte_order_mark" ToolTip="BOM on Wikipedia" Style="{StaticResource HelpHyperlink}">
<Hyperlink NavigateUri="https://docs.microsoft.com/en-us/windows/win32/intl/code-page-identifiers" ToolTip="Code Page Identifiers on Microsoft Docs" Style="{StaticResource HelpHyperlink}">
<TextBlock Style="{StaticResource HelpTextBlock}"/>
</Hyperlink>
</TextBlock>
Expand Down
4 changes: 2 additions & 2 deletions Src/CsvLINQPadDriver/Directory.Build.props
Original file line number Diff line number Diff line change
@@ -1,7 +1,7 @@
<Project>
<PropertyGroup>
<Version>6.15.1</Version>
<PackageReleaseNotes>Updated dependencies.</PackageReleaseNotes>
<Version>6.16.0</Version>
<PackageReleaseNotes>Added more encodings.</PackageReleaseNotes>
</PropertyGroup>

<PropertyGroup>
Expand Down
66 changes: 41 additions & 25 deletions Src/CsvLINQPadDriver/Extensions/FileExtensions.cs
Original file line number Diff line number Diff line change
Expand Up @@ -49,7 +49,7 @@ private static
#endif
StringInternCache = null!;

private static readonly Lazy<IReadOnlyDictionary<NoBomEncoding, Encoding>> NoBomEncodings = new(CalculateNoBomEncodings);
private static readonly Dictionary<NoBomEncoding, Encoding> NoBomEncodings = new();

private record SupportedFileType(FileType FileType, string Extension, string Description)
{
Expand Down Expand Up @@ -82,6 +82,11 @@ public string Mask
public static readonly string DefaultMask = GetMask(DefaultFileType);
public static readonly string DefaultRecursiveMask = GetMask(DefaultFileType, true);

#if NETCOREAPP
// Static constructor: registers the code-pages encoding provider with Encoding.
// It is only compiled for NETCOREAPP targets (see the surrounding #if).
static FileExtensions() =>
Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
#endif

public static string GetMask(this FileType fileType, bool recursive = false) =>
$"{(recursive ? RecursiveMaskMarker : "*")}.{fileType.GetSupportedFileType().Mask}";

Expand Down Expand Up @@ -369,7 +374,7 @@ private static CsvParser CreateCsvParser(
csvConfiguration.Delimiter = csvSeparator?.ToString() ?? csvConfiguration.Delimiter;
csvConfiguration.BadDataFound = ignoreBadData ? null : csvConfiguration.BadDataFound;

var encoding = (autoDetectEncoding ? DetectEncoding(fileName) : null) ?? NoBomEncodings.Value[noBomEncoding];
var encoding = (autoDetectEncoding ? DetectEncoding(fileName) : null) ?? GetFallbackEncoding(noBomEncoding);

return new CsvParser(new StreamReader(fileName, encoding, !autoDetectEncoding, bufferSize / sizeof(char)), csvConfiguration);
}
Expand Down Expand Up @@ -450,32 +455,43 @@ private static IEnumerable<string[]> CsvReadRows(
}
}

[DllImport("kernel32.dll")]
private static extern int GetSystemDefaultLCID();

[DllImport("kernel32.dll")]
private static extern int GetUserDefaultLCID();

private static IReadOnlyDictionary<NoBomEncoding, Encoding> CalculateNoBomEncodings()
private static Encoding GetFallbackEncoding(NoBomEncoding noBomEncoding)
{
#if NETCOREAPP
Encoding.RegisterProvider(CodePagesEncodingProvider.Instance);
#endif

return new Dictionary<NoBomEncoding, Encoding>
if (!NoBomEncodings.TryGetValue(noBomEncoding, out var encoding))
{
[NoBomEncoding.UTF8] = Encoding.UTF8,
[NoBomEncoding.Unicode] = Encoding.Unicode,
[NoBomEncoding.BigEndianUnicode] = Encoding.BigEndianUnicode,
[NoBomEncoding.UTF32] = Encoding.UTF32,
[NoBomEncoding.BigEndianUTF32] = new UTF32Encoding(true, true),
[NoBomEncoding.ASCII] = Encoding.ASCII,
[NoBomEncoding.SystemCodePage] = GetCodePageEncoding(false),
[NoBomEncoding.UserCodePage] = GetCodePageEncoding(true)
};
NoBomEncodings.Add(noBomEncoding, encoding = GetEncoding());
}

static Encoding GetCodePageEncoding(bool user) =>
Encoding.GetEncoding(CultureInfo.GetCultureInfo(user ? GetUserDefaultLCID() : GetSystemDefaultLCID()).TextInfo.ANSICodePage);
return encoding!;

// Resolves the Encoding for the noBomEncoding value captured from the enclosing method.
Encoding GetEncoding()
{
// Explicitly listed members map to a fixed encoding; every other member is
// resolved by code page number through FromCodePage().
return noBomEncoding switch
{
NoBomEncoding.UTF8 => Encoding.UTF8,
NoBomEncoding.Unicode => Encoding.Unicode,
NoBomEncoding.BigEndianUnicode => Encoding.BigEndianUnicode,
NoBomEncoding.UTF32 => Encoding.UTF32,
// UTF32Encoding(bigEndian: true, byteOrderMark: true).
NoBomEncoding.BigEndianUTF32 => new UTF32Encoding(true, true),
// NOTE(review): Encoding.UTF7 is marked obsolete (SYSLIB0001) on .NET 5+ — confirm the warning is handled for NETCOREAPP builds.
NoBomEncoding.UTF7 => Encoding.UTF7,
NoBomEncoding.ASCII => Encoding.ASCII,
NoBomEncoding.SystemCodePage => GetCodePageEncoding(false),
NoBomEncoding.UserCodePage => GetCodePageEncoding(true),
_ => Encoding.GetEncoding(FromCodePage())
};

// Returns the encoding for the ANSI code page of the culture identified by the
// user default LCID (user == true) or the system default LCID (user == false).
static Encoding GetCodePageEncoding(bool user) =>
Encoding.GetEncoding(CultureInfo.GetCultureInfo(user ? GetUserDefaultLCID() : GetSystemDefaultLCID()).TextInfo.ANSICodePage);

// Parses the enum member name, minus its first two characters, as an integer code page.
// NOTE(review): presumably the remaining NoBomEncoding members are named as a
// two-character prefix followed by the code page number — confirm against the enum declaration.
int FromCodePage() =>
Convert.ToInt32(noBomEncoding.ToString()[2..], CultureInfo.InvariantCulture);

// P/Invoke declarations into kernel32.dll used by GetCodePageEncoding above.
[DllImport("kernel32.dll")]
static extern int GetSystemDefaultLCID();

[DllImport("kernel32.dll")]
static extern int GetUserDefaultLCID();
}
}

private static Encoding? DetectEncoding(string fileName)
Expand Down
Loading

0 comments on commit c8e593f

Please sign in to comment.