Skip to content

Commit

Permalink
Add Office Parser Metadata
Browse files Browse the repository at this point in the history
  • Loading branch information
Vytek committed Sep 5, 2023
1 parent 694cd04 commit 170f1d1
Show file tree
Hide file tree
Showing 5 changed files with 59 additions and 6 deletions.
4 changes: 2 additions & 2 deletions README.md
Original file line number Diff line number Diff line change
Expand Up @@ -13,6 +13,6 @@ List of formats read:
| TXT | X | text/plain; charset=utf-8 | |
| RTF | X | text/rtf | |
| ODT | X | application/vnd.oasis.opendocument.text | X |
| DOCX | X | application/vnd.openxmlformats-officedocument.wordprocessingml.document | |
| PPTX | X | application/vnd.openxmlformats-officedocument.presentationml.presentation | |
| DOCX | X | application/vnd.openxmlformats-officedocument.wordprocessingml.document | X |
| PPTX | X | application/vnd.openxmlformats-officedocument.presentationml.presentation | X |
| PDF | X | application/pdf | X |
2 changes: 2 additions & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -8,11 +8,13 @@ require (
github.com/EndFirstCorp/peekingReader v0.0.0-20171012052444-257fb6f1a1a6 // indirect
github.com/sirupsen/logrus v1.7.0 // indirect
golang.org/x/sys v0.8.0 // indirect
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect
)

require (
github.com/Tulip-Data/pdf v1.0.2
github.com/flotzilla/pdf_parser v0.1.96
github.com/gocaio/metagoffice v0.0.0-20190424181953-6f9c150bd74b
github.com/gocaio/metagopenoffice v0.0.0-20190424182207-bbc961c10caf
github.com/h2non/filetype v1.1.3
github.com/lu4p/cat v0.1.5
Expand Down
12 changes: 12 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -2,26 +2,38 @@ github.com/EndFirstCorp/peekingReader v0.0.0-20171012052444-257fb6f1a1a6 h1:t27C
github.com/EndFirstCorp/peekingReader v0.0.0-20171012052444-257fb6f1a1a6/go.mod h1:zpqkXxDsVfEIUZEWvT9yAo8OmRvSlRrcYQ3Zs8sSubA=
github.com/Tulip-Data/pdf v1.0.2 h1:kp7wTAJl+Pq2EuBD9xlwDufzbKu2aQOmR8gDOMrsh0s=
github.com/Tulip-Data/pdf v1.0.2/go.mod h1:px/4Told5tJtM+dxsmYTCdi58DkKLLGAHN4KxruWemQ=
github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c=
github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38=
github.com/flotzilla/pdf_parser v0.1.96 h1:SlgvO7NZqFzhBO+o6X1u7rUYjhv+81V3dYQF+LTfGOE=
github.com/flotzilla/pdf_parser v0.1.96/go.mod h1:/CPB1OWEeFqRbtnFWXgArmOnA3u7smVHxr5dFy4U6Nk=
github.com/gabriel-vasile/mimetype v1.1.1/go.mod h1:6CDPel/o/3/s4+bp6kIbsWATq8pmgOisOPG40CJa6To=
github.com/gabriel-vasile/mimetype v1.4.2 h1:w5qFW6JKBz9Y393Y4q372O9A7cUSequkh1Q7OhCmWKU=
github.com/gabriel-vasile/mimetype v1.4.2/go.mod h1:zApsH/mKG4w07erKIaJPFiX0Tsq9BFQgN3qGY5GnNgA=
github.com/gocaio/metagoffice v0.0.0-20190424181953-6f9c150bd74b h1:JTcrUsCpBvla/FKZbAH0+NvykO8EhhU4oMKNc8NWMOU=
github.com/gocaio/metagoffice v0.0.0-20190424181953-6f9c150bd74b/go.mod h1:TY04sDprSxYfxcvKl+N0vhaFjSgXoKyFqKaRnRXsZUs=
github.com/gocaio/metagopenoffice v0.0.0-20190424182207-bbc961c10caf h1:ZGa9zKy3lONMNzsJzgP9LEECjjzFzTSTJllM3GEIq7c=
github.com/gocaio/metagopenoffice v0.0.0-20190424182207-bbc961c10caf/go.mod h1:RySwFSDrnowBfrD7tQP1DucjTiKwCMdI+67mMy8YaaM=
github.com/h2non/filetype v1.1.3 h1:FKkx9QbD7HR/zjK1Ia5XiBsq9zdLi5Kf3zGyFTAFkGg=
github.com/h2non/filetype v1.1.3/go.mod h1:319b3zT68BvV+WRj7cwy856M2ehB3HqNOt6sy1HndBY=
github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI=
github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI=
github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ=
github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE=
github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI=
github.com/lu4p/cat v0.1.5 h1:s51Bp/ns3u6n+hjjL2F77ySY6j/GD5SJG/t6Ok4Y1S0=
github.com/lu4p/cat v0.1.5/go.mod h1:G3YRyjSvBipqMBRZ2uLf1oRL3/eGGmuZf96m95Y4jRQ=
github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM=
github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4=
github.com/sirupsen/logrus v1.7.0 h1:ShrD1U9pZB12TX0cVy0DtePoCH97K8EtX+mg7ZARUtM=
github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0=
github.com/stretchr/testify v1.2.2 h1:bSDNvY7ZPG5RlJ8otE/7V6gMiyenm9RtJ7IUVIAoJ1w=
github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs=
golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M=
golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg=
golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs=
golang.org/x/sys v0.8.0 h1:EBmGv8NaZBZTWvrbjNoL6HVt+IVy3QDQpJs7VRIw3tU=
golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk=
gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q=
gopkg.in/xmlpath.v2 v2.0.0-20150820204837-860cbeca3ebc h1:LMEBgNcZUqXaP7evD1PZcL6EcDVa2QOFuI+cqM3+AJM=
gopkg.in/xmlpath.v2 v2.0.0-20150820204837-860cbeca3ebc/go.mod h1:N8UOSI6/c2yOpa/XDz3KVUiegocTziPiqNkeNTMiG1k=
15 changes: 13 additions & 2 deletions opencrucible.go
Original file line number Diff line number Diff line change
Expand Up @@ -10,13 +10,14 @@ import (
"github.com/Tulip-Data/pdf"
"github.com/flotzilla/pdf_parser"
"github.com/gabriel-vasile/mimetype"
"github.com/gocaio/metagoffice"
"github.com/gocaio/metagopenoffice"
"github.com/h2non/filetype"
"github.com/lu4p/cat"
)

// Version exposes the current package version.
const Version = "0.0.5"
const Version = "0.0.6"

//Detects

Expand Down Expand Up @@ -155,7 +156,7 @@ func PDFFileMetadata(FileToParse string) (*pdf_parser.PdfInfo, error) {
}

// A pointer is returned so that nil can be returned on error; see: https://stackoverflow.com/questions/50697914/return-nil-for-a-struct-in-go
func ODTFileMetadata(FileToParse string) (*metagopenoffice.OpenOfficeXML, error) {
func OpenOfficeFileMetadata(FileToParse string) (*metagopenoffice.OpenOfficeXML, error) {
file, err := os.Open(FileToParse)
if err != nil {
return nil, fmt.Errorf("error opening file: %s", err)
Expand All @@ -164,3 +165,13 @@ func ODTFileMetadata(FileToParse string) (*metagopenoffice.OpenOfficeXML, error)
content, err := metagopenoffice.GetMetada(file)
return &content, err
}

// OfficeFileMetadata opens the file at FileToParse and extracts its metadata
// with metagoffice.GetContent.
//
// It returns a pointer to the parsed metadata, or nil and a non-nil error when
// the file cannot be opened or when GetContent reports a failure.
func OfficeFileMetadata(FileToParse string) (*metagoffice.XMLContent, error) {
	file, err := os.Open(FileToParse)
	if err != nil {
		return nil, fmt.Errorf("error opening file: %w", err)
	}
	// Close only after GetContent has finished with the handle; closing it
	// up front would hand GetContent an already-closed file.
	defer file.Close()

	content, err := metagoffice.GetContent(file)
	if err != nil {
		// Do not hand back a pointer to a possibly incomplete struct
		// together with an error.
		return nil, err
	}
	return &content, nil
}
32 changes: 30 additions & 2 deletions opencrucible_test.go
Original file line number Diff line number Diff line change
Expand Up @@ -62,8 +62,8 @@ func TestODTFileParser(t *testing.T) {
}
}

func TestODTMetadata(t *testing.T) {
got, err := ODTFileMetadata(filepath.Join("test_file", "test_file_odt.odt"))
func TestOpenOfficeMetadata(t *testing.T) {
got, err := OpenOfficeFileMetadata(filepath.Join("test_file", "test_file_odt.odt"))
if err != nil {
t.Errorf("error loading file \n %s", err)
}
Expand Down Expand Up @@ -156,6 +156,34 @@ func TestPPTXFileParser(t *testing.T) {
}
}

// TestOfficeMetadata checks that OfficeFileMetadata reports the expected
// creator for the sample DOCX file.
func TestOfficeMetadata(t *testing.T) {
	got, err := OfficeFileMetadata(filepath.Join("test_file", "test_file_docx_ms.docx"))
	if err != nil {
		// Fatalf rather than Errorf: got can be nil on error, and the
		// field accesses below would panic instead of failing cleanly.
		t.Fatalf("error loading file \n %s", err)
	}
	want := "Enrico Speranza"
	t.Logf("Parsed: %s", got.Title)
	t.Logf("Parsed: %s", got.Created)
	t.Logf("Parsed: %s", got.LastModifiedBy)
	if got.Creator != want {
		t.Errorf("got %q, wanted %q", got.Creator, want)
	}
}

// TestOfficePPTXMetadata checks that OfficeFileMetadata reports the expected
// title for the sample PPTX file.
func TestOfficePPTXMetadata(t *testing.T) {
	got, err := OfficeFileMetadata(filepath.Join("test_file", "test_file_pptx.pptx"))
	if err != nil {
		// Fatalf rather than Errorf: got can be nil on error, and the
		// field accesses below would panic instead of failing cleanly.
		t.Fatalf("error loading file \n %s", err)
	}
	want := "Presentazione standard di PowerPoint"
	t.Logf("Parsed: %s", got.Title)
	t.Logf("Parsed: %s", got.Created)
	t.Logf("Parsed: %s", got.LastModifiedBy)
	if got.Title != want {
		t.Errorf("got %q, wanted %q", got.Title, want)
	}
}

func TestPDFMetdata(t *testing.T) {
got, err := PDFFileMetadata(filepath.Join("test_file", "test_file_pdf.pdf"))
if err != nil {
Expand Down

0 comments on commit 170f1d1

Please sign in to comment.