Skip to content

Commit

Permalink
v0.2.5
Browse files Browse the repository at this point in the history
  • Loading branch information
shenwei356 committed Apr 11, 2016
1 parent 23b6974 commit 326281f
Show file tree
Hide file tree
Showing 10 changed files with 369 additions and 17 deletions.
5 changes: 4 additions & 1 deletion README.md
Original file line number Diff line number Diff line change
Expand Up @@ -28,11 +28,12 @@ You can also add the directory of the executable file to environment variable

or simply copy it to `/usr/local/bin`

## Subcommands (16 in total)
## Subcommands (18 in total)

**Information**

- `stat` summary of CSV file
- `stat2` summary of selected number fields

**Format conversion**

Expand Down Expand Up @@ -92,6 +93,8 @@ to be continued...
starts with `#`, please assign `-C` another rare symbol, e.g. `&`.
4. By default, csvtk handles CSV files, use `-t` for tab-delimited files.

More [examples](http://shenwei356.github.io/csvtk/usage/) and [tutorial](http://shenwei356.github.io/csvtk/tutorial/)

Examples

1. Select fields/columns (`cut`)
Expand Down
2 changes: 1 addition & 1 deletion csvtk/cmd/cut.go
Original file line number Diff line number Diff line change
Expand Up @@ -47,7 +47,7 @@ var cutCmd = &cobra.Command{

printNames := getFlagBool(cmd, "colnames")
if printNames && config.NoHeaderRow {
checkError(fmt.Errorf("flag -n (--colnames) and -T (--no-header-row) should not given both"))
checkError(fmt.Errorf("flag -n (--colnames) and -H (--no-header-row) should not given both"))
}

fieldStr := getFlagString(cmd, "fields")
Expand Down
14 changes: 13 additions & 1 deletion csvtk/cmd/helper.go
Original file line number Diff line number Diff line change
Expand Up @@ -237,7 +237,7 @@ func NewCSVWriterChanByConfig(config Config) (chan []string, error) {
}

var reFields = regexp.MustCompile(`([^,]+)(,[^,]+)*,?`)
var reDigitals = regexp.MustCompile(`^[\-\d]+$`)
var reDigitals = regexp.MustCompile(`^[\-\d\.e,E\+]+$`)
var reDigitalRange = regexp.MustCompile(`^([\-\d]+?)\-([\-\d]+?)$`)

func getFlagFields(cmd *cobra.Command, flag string) string {
Expand Down Expand Up @@ -453,3 +453,15 @@ func parseCSVfile(cmd *cobra.Command, config Config, file string,
}
return HeaderRow, Data, fields
}

// removeComma returns a copy of s with every ',' byte dropped, so that
// numbers written with thousands separators (e.g. "1,234.5") can be handed
// to a numeric parser. All other bytes are kept in their original order.
func removeComma(s string) string {
	kept := make([]byte, 0, len(s))
	for idx := range []byte(s) {
		if s[idx] == ',' {
			continue
		}
		kept = append(kept, s[idx])
	}
	return string(kept)
}
2 changes: 1 addition & 1 deletion csvtk/cmd/root.go
Original file line number Diff line number Diff line change
Expand Up @@ -34,7 +34,7 @@ var RootCmd = &cobra.Command{
Short: "Another cross-platform, efficient and practical CSV/TSV toolkit",
Long: `Another cross-platform, efficient and practical CSV/TSV toolkit
Version: 0.2.4
Version: 0.2.5
Author: Wei Shen <shenwei356@gmail.com>
Expand Down
22 changes: 20 additions & 2 deletions csvtk/cmd/stat.go
Original file line number Diff line number Diff line change
Expand Up @@ -21,11 +21,13 @@
package cmd

import (
"fmt"
"runtime"

"github.com/brentp/xopen"
"github.com/spf13/cobra"

"github.com/tatsushid/go-prettytable"
"github.com/dustin/go-humanize"
)

// statCmd represents the stat command
Expand All @@ -44,6 +46,14 @@ var statCmd = &cobra.Command{
checkError(err)
defer outfh.Close()


tbl, err := prettytable.NewTable([]prettytable.Column{
{Header: "file"},
{Header: "num_cols", AlignRight: true},
{Header: "num_rows", AlignRight: true}}...)
checkError(err)
tbl.Separator = " "

for _, file := range files {
csvReader, err := newCSVReaderByConfig(config, file)
checkError(err)
Expand All @@ -63,8 +73,16 @@ var statCmd = &cobra.Command{
once = false
}
}
outfh.WriteString(fmt.Sprintf("file: %s num_cols: %d num_rows: %d\n", file, numCols, numRows))
if !config.NoHeaderRow {
numRows--
}
tbl.AddRow(
file,
humanize.Comma(int64(numCols)),
humanize.Comma(int64(numRows)))

}
outfh.Write(tbl.Bytes())
},
}

Expand Down
225 changes: 225 additions & 0 deletions csvtk/cmd/stat2.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
// Copyright © 2016 Wei Shen <shenwei356@gmail.com>
//
// Permission is hereby granted, free of charge, to any person obtaining a copy
// of this software and associated documentation files (the "Software"), to deal
// in the Software without restriction, including without limitation the rights
// to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
// copies of the Software, and to permit persons to whom the Software is
// furnished to do so, subject to the following conditions:
//
// The above copyright notice and this permission notice shall be included in
// all copies or substantial portions of the Software.
//
// THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
// IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
// FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT. IN NO EVENT SHALL THE
// AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
// LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
// OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
// THE SOFTWARE.

package cmd

import (
"fmt"
"regexp"
"runtime"
"strconv"
"sort"

"github.com/brentp/xopen"
"github.com/spf13/cobra"
"github.com/gonum/floats"
"github.com/tatsushid/go-prettytable"
"github.com/dustin/go-humanize"
"github.com/gonum/stat"
"github.com/shenwei356/util/math"
)

// stat2Cmd represents the stat2 command. It reads a single CSV/TSV file,
// collects the values of the selected fields as float64 and prints one table
// row per field with: num, sum, min, max, mean and stdev.
//
// Fields are chosen with -f (--fields), either as 1-based column numbers or
// as column names; with -F (--fuzzy-fields) the given names are turned into
// regular expressions by fuzzyField2Regexp and matched against the header.
var stat2Cmd = &cobra.Command{
	Use:   "stat2",
	Short: "summary of selected number fields",
	Long: `summary of selected number fields: num, sum, min, max, mean, stdev
`,
	Run: func(cmd *cobra.Command, args []string) {
		config := getConfigs(cmd)
		files := getFileList(args)
		if len(files) > 1 {
			checkError(fmt.Errorf("no more than one file should be given"))
		}
		runtime.GOMAXPROCS(config.NumCPUs)

		fieldStr := getFlagString(cmd, "fields")
		if fieldStr == "" {
			// The flag is registered as "fields" in init(); name it correctly.
			checkError(fmt.Errorf("flag -f (--fields) needed"))
		}

		fuzzyFields := getFlagBool(cmd, "fuzzy-fields")
		// parseFields is defined elsewhere in the package; judging by how its
		// results are used below, it yields numeric field indices OR column
		// names, whether the selection is negated, and whether the header row
		// has to be inspected. NOTE(review): confirm against parseFields.
		fields, colnames, negativeFields, needParseHeaderRow := parseFields(cmd, fieldStr, config.NoHeaderRow)
		var fieldsMap map[int]struct{}
		if len(fields) > 0 {
			// Numeric selection: store every index with its sign flipped when
			// the selection is negative, both in the slice and in the lookup set.
			fields2 := make([]int, len(fields))
			fieldsMap = make(map[int]struct{}, len(fields))
			for i, f := range fields {
				if negativeFields {
					fieldsMap[f*-1] = struct{}{}
					fields2[i] = f * -1
				} else {
					fieldsMap[f] = struct{}{}
					fields2[i] = f
				}
			}
			fields = fields2
		}

		outfh, err := xopen.Wopen(config.OutFile)
		checkError(err)
		defer outfh.Close()

		file := files[0]
		csvReader, err := newCSVReaderByConfig(config, file)
		checkError(err)
		csvReader.Run()

		parseHeaderRow := needParseHeaderRow // parsing header row
		var colnames2fields map[string]int   // column name -> field
		var colnamesMap map[string]*regexp.Regexp
		var HeaderRow []string
		var isHeaderRow bool

		// checkFields is true only until the first record has been used to
		// resolve the final list of selected column indices.
		checkFields := true

		// data maps a 1-based column index to the numbers parsed from it.
		data := make(map[int][]float64)

		for chunk := range csvReader.Ch {
			checkError(chunk.Err)

			for _, record := range chunk.Data {
				if parseHeaderRow { // parsing header row
					colnames2fields = make(map[string]int, len(record))
					for i, col := range record {
						colnames2fields[col] = i + 1
					}
					colnamesMap = make(map[string]*regexp.Regexp, len(colnames))
					for _, col := range colnames {
						if negativeFields {
							// The map key drops the first character of the
							// given name, while the regexp is built from the
							// full name.
							// NOTE(review): confirm fuzzyField2Regexp copes
							// with that leading character.
							colnamesMap[col[1:]] = fuzzyField2Regexp(col)
						} else {
							colnamesMap[col] = fuzzyField2Regexp(col)
						}
					}

					if len(fields) == 0 { // user gives the colnames
						fields = []int{}
						for _, col := range record {
							var ok bool
							if fuzzyFields {
								for _, re := range colnamesMap {
									if re.MatchString(col) {
										ok = true
										break
									}
								}
							} else {
								_, ok = colnamesMap[col]
							}
							// Keep matched columns, or the unmatched ones when
							// the selection is negative.
							if (negativeFields && !ok) || (!negativeFields && ok) {
								fields = append(fields, colnames2fields[col])
							}
						}
					}

					fieldsMap = make(map[int]struct{}, len(fields))
					for _, f := range fields {
						fieldsMap[f] = struct{}{}
					}

					HeaderRow = record
					parseHeaderRow = false
					isHeaderRow = true
				}
				if checkFields {
					// Rebuild fields as the column indices of this record, in
					// column order, that are in fieldsMap (or are absent from
					// it for a negative selection).
					fields2 := []int{}
					for f := range record {
						_, ok := fieldsMap[f+1]
						if negativeFields {
							if !ok {
								fields2 = append(fields2, f+1)
							}
						} else {
							if ok {
								fields2 = append(fields2, f+1)
							}
						}
					}
					fields = fields2
					if len(fields) == 0 {
						checkError(fmt.Errorf("no fields matched in file: %s", file))
					}

					checkFields = false
				}

				if isHeaderRow {
					// The header record only supplies names; it carries no data.
					isHeaderRow = false
					continue
				}
				for _, f := range fields {
					if !reDigitals.MatchString(record[f-1]) {
						checkError(fmt.Errorf("column %d has non-number data: %s", f, record[f-1]))
					}
					// Commas pass the reDigitals check, so strip them before
					// handing the text to ParseFloat.
					v, e := strconv.ParseFloat(removeComma(record[f-1]), 64)
					checkError(e)
					// append allocates on first use; a missing key yields a nil slice.
					data[f] = append(data[f], v)
				}
			}
		}
		tbl, err := prettytable.NewTable([]prettytable.Column{
			{Header: "field"},
			{Header: "num", AlignRight: true},
			{Header: "sum", AlignRight: true},
			{Header: "min", AlignRight: true},
			{Header: "max", AlignRight: true},
			{Header: "mean", AlignRight: true},
			{Header: "stdev", AlignRight: true}}...)
		checkError(err)
		tbl.Separator = " "

		// Map iteration order is random; sort the column indices so rows are
		// printed in column order.
		fields = []int{}
		for f := range data {
			fields = append(fields, f)
		}
		sort.Ints(fields)

		var fieldS string
		for _, f := range fields {
			// Label the row with the column name when a header was parsed,
			// otherwise with the column number.
			if needParseHeaderRow {
				fieldS = HeaderRow[f-1]
			} else {
				fieldS = strconv.Itoa(f)
			}
			mean, stdev := stat.MeanStdDev(data[f], nil)
			tbl.AddRow(
				fieldS,
				humanize.Comma(int64(len(data[f]))),
				humanize.Commaf(math.Round(floats.Sum(data[f]), 2)),
				humanize.Commaf(math.Round(floats.Min(data[f]), 2)),
				humanize.Commaf(math.Round(floats.Max(data[f]), 2)),
				humanize.Commaf(math.Round(mean, 2)),
				humanize.Commaf(math.Round(stdev, 2)))
		}
		outfh.Write(tbl.Bytes())
	},
}

// init attaches stat2Cmd to RootCmd and declares the two flags the command
// reads: -f/--fields (string, default "") and -F/--fuzzy-fields (bool,
// default false).
func init() {
	RootCmd.AddCommand(stat2Cmd)

	flagSet := stat2Cmd.Flags()
	flagSet.StringP("fields", "f", "", `select only these fields. e.g -f 1,2 or -f columnA,columnB`)
	flagSet.BoolP("fuzzy-fields", "F", false, `using fuzzy fields, e.g. *name or id123*`)
}
10 changes: 7 additions & 3 deletions doc/docs/download.md
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,10 @@

## Current Version

- [csvtk v0.2.4](https://github.com/shenwei356/csvtk/releases/tag/v0.2.4)
- fix bug of handling comment lines
- add some notes before using csvtk
- [csvtk v0.2.5](https://github.com/shenwei356/csvtk/releases/tag/v0.2.5)
- fix bug of `stat` that failed to account for files with a header row
- add subcommand `stat2` - summary of selected number fields
- make the output of `stat` prettier

## Installation

Expand All @@ -32,6 +33,9 @@ You can also add the directory of the executable file to environment variable

## Previous Versions

- [csvtk v0.2.4](https://github.com/shenwei356/csvtk/releases/tag/v0.2.4)
- fix bug of handling comment lines
- add some notes before using csvtk
- [csvtk v0.2.3](https://github.com/shenwei356/csvtk/releases/tag/v0.2.3)
- add flag `--colnames` to `cut`
- flag `-f` (`--fields`) of `join` supports single value now
Expand Down
Loading

0 comments on commit 326281f

Please sign in to comment.