Skip to content

Commit

Permalink
Add encoding auto-detection. (#4)
Browse files Browse the repository at this point in the history
Bugfixes.

Co-authored-by: Ivan Ivon <ivan.ivon@zerto.com>
  • Loading branch information
i2van and Ivan Ivon authored Jun 25, 2021
1 parent 6213956 commit 05841da
Show file tree
Hide file tree
Showing 24 changed files with 342 additions and 136 deletions.
2 changes: 1 addition & 1 deletion Deploy/buildlpx.cmd
Original file line number Diff line number Diff line change
@@ -1,4 +1,4 @@
@set version=6.12.0
@set version=6.13.0
@set zip="%ProgramFiles%\7-Zip\7z.exe"
@set output="CsvLINQPadDriver.%version%.lpx6"

Expand Down
8 changes: 5 additions & 3 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -159,15 +159,16 @@ CSV files connection can be added to LINQPad 6 the same way as any other connect
* `c:\Books\Books?.csv`: `Books.csv`, `Books1.csv`, etc. files in folder `c:\Books`
* `c:\Books\*.csv`: all `*.csv` files in folder `c:\Books`
* `c:\Books\**.csv`: all `*.csv` files in folder `c:\Books` and its sub-folders.
* Order files by: specifies files sort order. Affects similar files order.
* No BOM encoding: specifies encoding for files without [BOM](https://en.wikipedia.org/wiki/Byte_order_mark). `UTF-8` is default.
* Order files by: specify the sort order of files. Affects the order of similar files.
* Fallback encoding: specify the encoding to use if the file encoding cannot be detected, e.g. due to a missing [BOM](https://en.wikipedia.org/wiki/Byte_order_mark). `UTF-8` is the default.
* Auto-detect file encodings: try to detect file encodings.
* Validate file paths: check if file paths are valid.
* Ignore files with invalid format: files with content which does not resemble CSV will be ignored.

### Format ###

* CSV separator: character used to separate columns in files. Can be `,`, `\t`, etc. Auto-detected if empty.
* Use CsvHelper library separator auto-detection: use CsvHelper library separator auto-detection instead of internal one.
* Use [CsvHelper](https://joshclose.github.io/CsvHelper) library separator auto-detection: use the CsvHelper library's separator auto-detection instead of the internal one.
* Ignore bad data: ignore malformed CSV data.
* Allow comments: lines starting with `#` will be ignored.

Expand Down Expand Up @@ -402,6 +403,7 @@ TimeSpan? ToTimeSpan(
* [Moq](https://github.com/moq/moq4)
* [NUnit](https://github.com/nunit/nunit)
* [Windows API Code Pack](https://github.com/contre/Windows-API-Code-Pack-1.1)
* [UnicodeCharsetDetector](https://github.com/i2van/UnicodeCharsetDetector)
## License ##

Expand Down
3 changes: 2 additions & 1 deletion Src/CsvLINQPadDriver/CodeGen/CsvCSharpCodeGenerator.cs
Original file line number Diff line number Diff line change
Expand Up @@ -68,6 +68,7 @@ public class {_contextTypeName} : {typeof(CsvDataContextBase).GetCodeTypeClassNa
{typeof(NoBomEncoding).GetCodeTypeClassName()}.{_properties.NoBomEncoding},
{GetBoolConst(_properties.AllowComments)},
{GetBoolConst(_properties.IgnoreBadData)},
{GetBoolConst(_properties.AutoDetectEncoding)},
{table.FilePath.AsValidCSharpCode()},
new {typeof(CsvColumnInfoList<>).GetCodeTypeClassName(GetClassName(table))} {{
{string.Join(string.Empty, table.Columns.Select(c => $@"{{ {c.Index}, x => x.{c.CodeName} }}, "))}
Expand All @@ -81,7 +82,7 @@ public class {_contextTypeName} : {typeof(CsvDataContextBase).GetCodeTypeClassNa
}}
}} // context class

// Data types {string.Join(Environment.NewLine, groups.Select(grouping => grouping.First().Code))} // data types
// Data types {string.Join(Environment.NewLine, groups.Select(grouping => grouping.OrderByDescending(code => code.Code.Length).First().Code))} // data types
}} // namespace
", groups);

Expand Down
5 changes: 4 additions & 1 deletion Src/CsvLINQPadDriver/CodeGen/CsvTableBase.cs
Original file line number Diff line number Diff line change
Expand Up @@ -25,6 +25,7 @@ public abstract class CsvTableBase<TRow> : CsvTableBase, IEnumerable<TRow>
private readonly NoBomEncoding _noBomEncoding;
private readonly bool _allowComments;
private readonly bool _ignoreBadData;
private readonly bool _autoDetectEncoding;

protected readonly string FilePath;

Expand All @@ -34,6 +35,7 @@ protected CsvTableBase(
NoBomEncoding noBomEncoding,
bool allowComments,
bool ignoreBadData,
bool autoDetectEncoding,
string filePath,
IEnumerable<CsvColumnInfo> propertiesInfo,
Action<TRow> relationsInit)
Expand All @@ -43,14 +45,15 @@ protected CsvTableBase(
_noBomEncoding = noBomEncoding;
_allowComments = allowComments;
_ignoreBadData = ignoreBadData;
_autoDetectEncoding = autoDetectEncoding;

FilePath = filePath;

_cachedCsvRowMappingBase ??= new CsvRowMappingBase<TRow>(propertiesInfo, relationsInit);
}

protected IEnumerable<TRow> ReadData() =>
FilePath.CsvReadRows(_csvSeparator, IsStringInternEnabled, _noBomEncoding, _allowComments, _ignoreBadData, _cachedCsvRowMappingBase!);
FilePath.CsvReadRows(_csvSeparator, IsStringInternEnabled, _noBomEncoding, _allowComments, _ignoreBadData, _autoDetectEncoding, _cachedCsvRowMappingBase!);

// ReSharper disable once UnusedMember.Global
public abstract IEnumerable<TRow> WhereIndexed(Func<TRow, string> getProperty, string propertyName, params string[] values);
Expand Down
3 changes: 2 additions & 1 deletion Src/CsvLINQPadDriver/CodeGen/CsvTableEnumerable.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,10 +13,11 @@ public CsvTableEnumerable(
NoBomEncoding noBomEncoding,
bool allowComments,
bool ignoreBadData,
bool autoDetectEncoding,
string filePath,
IEnumerable<CsvColumnInfo> propertiesInfo,
Action<TRow> relationsInit)
: base(isStringInternEnabled, csvSeparator, noBomEncoding, allowComments, ignoreBadData, filePath, propertiesInfo, relationsInit)
: base(isStringInternEnabled, csvSeparator, noBomEncoding, allowComments, ignoreBadData, autoDetectEncoding, filePath, propertiesInfo, relationsInit)
{
}

Expand Down
5 changes: 3 additions & 2 deletions Src/CsvLINQPadDriver/CodeGen/CsvTableFactory.cs
Original file line number Diff line number Diff line change
Expand Up @@ -13,12 +13,13 @@ public static CsvTableBase<TRow> CreateTable<TRow>(
NoBomEncoding noBomEncoding,
bool allowComments,
bool ignoreBadData,
bool autoDetectEncoding,
string filePath,
IEnumerable<CsvColumnInfo> propertiesInfo,
Action<TRow> relationsInit)
where TRow : ICsvRowBase, new() =>
isCacheEnabled
? new CsvTableList<TRow>(isStringInternEnabled, csvSeparator, noBomEncoding, allowComments, ignoreBadData, filePath, propertiesInfo, relationsInit)
: new CsvTableEnumerable<TRow>(isStringInternEnabled, csvSeparator, noBomEncoding, allowComments, ignoreBadData, filePath, propertiesInfo, relationsInit);
? new CsvTableList<TRow>(isStringInternEnabled, csvSeparator, noBomEncoding, allowComments, ignoreBadData, autoDetectEncoding, filePath, propertiesInfo, relationsInit)
: new CsvTableEnumerable<TRow>(isStringInternEnabled, csvSeparator, noBomEncoding, allowComments, ignoreBadData, autoDetectEncoding, filePath, propertiesInfo, relationsInit);
}
}
3 changes: 2 additions & 1 deletion Src/CsvLINQPadDriver/CodeGen/CsvTableList.cs
Original file line number Diff line number Diff line change
Expand Up @@ -18,10 +18,11 @@ public CsvTableList(
NoBomEncoding noBomEncoding,
bool allowComments,
bool ignoreBadData,
bool autoDetectEncoding,
string filePath,
IEnumerable<CsvColumnInfo> propertiesInfo,
Action<TRow> relationsInit)
: base(isStringInternEnabled, csvSeparator, noBomEncoding, allowComments, ignoreBadData, filePath, propertiesInfo, relationsInit) =>
: base(isStringInternEnabled, csvSeparator, noBomEncoding, allowComments, ignoreBadData, autoDetectEncoding, filePath, propertiesInfo, relationsInit) =>
_dataCache = new Lazy<IList<TRow>>(() => ReadData().Cache($"{typeof(TRow).Name}:{FilePath}"));

private IList<TRow> DataCache =>
Expand Down
Loading

0 comments on commit 05841da

Please sign in to comment.