From 170f1d106ba3620a21ffec602fa41d74d9abf877 Mon Sep 17 00:00:00 2001 From: Enrico Speranza Date: Tue, 5 Sep 2023 15:17:49 +0000 Subject: [PATCH] Add Office Parser Metadata --- README.md | 4 ++-- go.mod | 2 ++ go.sum | 12 ++++++++++++ opencrucible.go | 15 +++++++++++++-- opencrucible_test.go | 32 ++++++++++++++++++++++++++++++-- 5 files changed, 59 insertions(+), 6 deletions(-) diff --git a/README.md b/README.md index e71a9a1..ed1d3b0 100644 --- a/README.md +++ b/README.md @@ -13,6 +13,6 @@ List of formats read: | TXT | X | text/plain; charset=utf-8 | | | RTF | X | text/rtf | | | ODT | X | application/vnd.oasis.opendocument.text | X | -| DOCX | X | application/vnd.openxmlformats-officedocument.wordprocessingml.document | | -| PPTX | X | application/vnd.openxmlformats-officedocument.presentationml.presentation | | +| DOCX | X | application/vnd.openxmlformats-officedocument.wordprocessingml.document | X | +| PPTX | X | application/vnd.openxmlformats-officedocument.presentationml.presentation | X | | PDF | X | application/pdf | X | diff --git a/go.mod b/go.mod index 447f6ff..a6f8617 100644 --- a/go.mod +++ b/go.mod @@ -8,11 +8,13 @@ require ( github.com/EndFirstCorp/peekingReader v0.0.0-20171012052444-257fb6f1a1a6 // indirect github.com/sirupsen/logrus v1.7.0 // indirect golang.org/x/sys v0.8.0 // indirect + gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c // indirect ) require ( github.com/Tulip-Data/pdf v1.0.2 github.com/flotzilla/pdf_parser v0.1.96 + github.com/gocaio/metagoffice v0.0.0-20190424181953-6f9c150bd74b github.com/gocaio/metagopenoffice v0.0.0-20190424182207-bbc961c10caf github.com/h2non/filetype v1.1.3 github.com/lu4p/cat v0.1.5 diff --git a/go.sum b/go.sum index a4afeae..efe1754 100644 --- a/go.sum +++ b/go.sum @@ -2,26 +2,38 @@ github.com/EndFirstCorp/peekingReader v0.0.0-20171012052444-257fb6f1a1a6 h1:t27C github.com/EndFirstCorp/peekingReader v0.0.0-20171012052444-257fb6f1a1a6/go.mod h1:zpqkXxDsVfEIUZEWvT9yAo8OmRvSlRrcYQ3Zs8sSubA= 
github.com/Tulip-Data/pdf v1.0.2 h1:kp7wTAJl+Pq2EuBD9xlwDufzbKu2aQOmR8gDOMrsh0s= github.com/Tulip-Data/pdf v1.0.2/go.mod h1:px/4Told5tJtM+dxsmYTCdi58DkKLLGAHN4KxruWemQ= +github.com/davecgh/go-spew v1.1.1 h1:vj9j/u1bqnvCEfJOwUhtlOARqs3+rkHYY13jYWTU97c= github.com/davecgh/go-spew v1.1.1/go.mod h1:J7Y8YcW2NihsgmVo/mv3lAwl/skON4iLHjSsI+c5H38= github.com/flotzilla/pdf_parser v0.1.96 h1:SlgvO7NZqFzhBO+o6X1u7rUYjhv+81V3dYQF+LTfGOE= github.com/flotzilla/pdf_parser v0.1.96/go.mod h1:/CPB1OWEeFqRbtnFWXgArmOnA3u7smVHxr5dFy4U6Nk= github.com/gabriel-vasile/mimetype v1.1.1/go.mod h1:6CDPel/o/3/s4+bp6kIbsWATq8pmgOisOPG40CJa6To= github.com/gabriel-vasile/mimetype v1.4.2 h1:w5qFW6JKBz9Y393Y4q372O9A7cUSequkh1Q7OhCmWKU= github.com/gabriel-vasile/mimetype v1.4.2/go.mod h1:zApsH/mKG4w07erKIaJPFiX0Tsq9BFQgN3qGY5GnNgA= +github.com/gocaio/metagoffice v0.0.0-20190424181953-6f9c150bd74b h1:JTcrUsCpBvla/FKZbAH0+NvykO8EhhU4oMKNc8NWMOU= +github.com/gocaio/metagoffice v0.0.0-20190424181953-6f9c150bd74b/go.mod h1:TY04sDprSxYfxcvKl+N0vhaFjSgXoKyFqKaRnRXsZUs= github.com/gocaio/metagopenoffice v0.0.0-20190424182207-bbc961c10caf h1:ZGa9zKy3lONMNzsJzgP9LEECjjzFzTSTJllM3GEIq7c= github.com/gocaio/metagopenoffice v0.0.0-20190424182207-bbc961c10caf/go.mod h1:RySwFSDrnowBfrD7tQP1DucjTiKwCMdI+67mMy8YaaM= github.com/h2non/filetype v1.1.3 h1:FKkx9QbD7HR/zjK1Ia5XiBsq9zdLi5Kf3zGyFTAFkGg= github.com/h2non/filetype v1.1.3/go.mod h1:319b3zT68BvV+WRj7cwy856M2ehB3HqNOt6sy1HndBY= +github.com/kr/pretty v0.2.1 h1:Fmg33tUaq4/8ym9TJN1x7sLJnHVwhP33CNkpYV/7rwI= +github.com/kr/pretty v0.2.1/go.mod h1:ipq/a2n7PKx3OHsz4KJII5eveXtPO4qwEXGdVfWzfnI= +github.com/kr/pty v1.1.1/go.mod h1:pFQYn66WHrOpPYNljwOMqo10TkYh1fy3cYio2l3bCsQ= +github.com/kr/text v0.1.0 h1:45sCR5RtlFHMR4UwH9sdQ5TC8v0qDQCHnXt+kaKSTVE= +github.com/kr/text v0.1.0/go.mod h1:4Jbv+DJW3UT/LiOwJeYQe1efqtUx/iVham/4vfdArNI= github.com/lu4p/cat v0.1.5 h1:s51Bp/ns3u6n+hjjL2F77ySY6j/GD5SJG/t6Ok4Y1S0= github.com/lu4p/cat v0.1.5/go.mod 
h1:G3YRyjSvBipqMBRZ2uLf1oRL3/eGGmuZf96m95Y4jRQ= +github.com/pmezard/go-difflib v1.0.0 h1:4DBwDE0NGyQoBHbLQYPwSUPoCMWR5BEzIk/f1lZbAQM= github.com/pmezard/go-difflib v1.0.0/go.mod h1:iKH77koFhYxTK1pcRnkKkqfTogsbg7gZNVY4sRDYZ/4= github.com/sirupsen/logrus v1.7.0 h1:ShrD1U9pZB12TX0cVy0DtePoCH97K8EtX+mg7ZARUtM= github.com/sirupsen/logrus v1.7.0/go.mod h1:yWOB1SBYBC5VeMP7gHvWumXLIWorT60ONWic61uBYv0= +github.com/stretchr/testify v1.2.2 h1:bSDNvY7ZPG5RlJ8otE/7V6gMiyenm9RtJ7IUVIAoJ1w= github.com/stretchr/testify v1.2.2/go.mod h1:a8OnRcib4nhh0OaRAV+Yts87kKdq0PP7pXfy6kDkUVs= golang.org/x/net v0.10.0 h1:X2//UzNDwYmtCLn7To6G58Wr6f5ahEAQgKNzv9Y951M= golang.org/x/net v0.10.0/go.mod h1:0qNGK6F8kojg2nk9dLZ2mShWaEBan6FAoqfSigmmuDg= golang.org/x/sys v0.0.0-20191026070338-33540a1f6037/go.mod h1:h1NjWce9XRLGQEsW7wpKNCjG9DtNlClVuFLEZdDNbEs= golang.org/x/sys v0.8.0 h1:EBmGv8NaZBZTWvrbjNoL6HVt+IVy3QDQpJs7VRIw3tU= golang.org/x/sys v0.8.0/go.mod h1:oPkhp1MJrh7nUepCBck5+mAzfO9JrbApNNgaTdGDITg= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c h1:Hei/4ADfdWqJk1ZMxUNpqntNwaWcugrBjAiHlqqRiVk= +gopkg.in/check.v1 v1.0.0-20201130134442-10cb98267c6c/go.mod h1:JHkPIbrfpd72SG/EVd6muEfDQjcINNoR0C8j2r3qZ4Q= gopkg.in/xmlpath.v2 v2.0.0-20150820204837-860cbeca3ebc h1:LMEBgNcZUqXaP7evD1PZcL6EcDVa2QOFuI+cqM3+AJM= gopkg.in/xmlpath.v2 v2.0.0-20150820204837-860cbeca3ebc/go.mod h1:N8UOSI6/c2yOpa/XDz3KVUiegocTziPiqNkeNTMiG1k= diff --git a/opencrucible.go b/opencrucible.go index f9a7ec2..eed8335 100644 --- a/opencrucible.go +++ b/opencrucible.go @@ -10,13 +10,14 @@ import ( "github.com/Tulip-Data/pdf" "github.com/flotzilla/pdf_parser" "github.com/gabriel-vasile/mimetype" + "github.com/gocaio/metagoffice" "github.com/gocaio/metagopenoffice" "github.com/h2non/filetype" "github.com/lu4p/cat" ) // Version exposes the current package version. 
-const Version = "0.0.5"
+const Version = "0.0.6"
 
 //Detects
 
@@ -155,7 +156,7 @@ func PDFFileMetadata(FileToParse string) (*pdf_parser.PdfInfo, error) {
 }
 
 // See for return: https://stackoverflow.com/questions/50697914/return-nil-for-a-struct-in-go
-func ODTFileMetadata(FileToParse string) (*metagopenoffice.OpenOfficeXML, error) {
+func OpenOfficeFileMetadata(FileToParse string) (*metagopenoffice.OpenOfficeXML, error) {
 	file, err := os.Open(FileToParse)
 	if err != nil {
 		return nil, fmt.Errorf("error opening file: %s", err)
@@ -164,3 +165,13 @@ func ODTFileMetadata(FileToParse string) (*metagopenoffice.OpenOfficeXML, error)
 	content, err := metagopenoffice.GetMetada(file)
 	return &content, err
 }
+
+func OfficeFileMetadata(FileToParse string) (*metagoffice.XMLContent, error) {
+	file, err := os.Open(FileToParse)
+	if err != nil {
+		return nil, fmt.Errorf("error opening file: %s", err)
+	}
+	defer file.Close() // keep the handle open until GetContent has finished reading from it
+	content, err := metagoffice.GetContent(file)
+	return &content, err
+}
diff --git a/opencrucible_test.go b/opencrucible_test.go
index 479bf3a..e2aee24 100644
--- a/opencrucible_test.go
+++ b/opencrucible_test.go
@@ -62,8 +62,8 @@ func TestODTFileParser(t *testing.T) {
 	}
 }
 
-func TestODTMetadata(t *testing.T) {
-	got, err := ODTFileMetadata(filepath.Join("test_file", "test_file_odt.odt"))
+func TestOpenOfficeMetadata(t *testing.T) {
+	got, err := OpenOfficeFileMetadata(filepath.Join("test_file", "test_file_odt.odt"))
 	if err != nil {
 		t.Errorf("error loading file \n %s", err)
 	}
@@ -156,6 +156,34 @@ func TestPPTXFileParser(t *testing.T) {
 	}
 }
 
+func TestOfficeMetadata(t *testing.T) {
+	got, err := OfficeFileMetadata(filepath.Join("test_file", "test_file_docx_ms.docx"))
+	if err != nil {
+		t.Fatalf("error loading file \n %s", err)
+	}
+	want := "Enrico Speranza"
+	t.Logf("Parsed: %s", got.Title)
+	t.Logf("Parsed: %s", got.Created)
+	t.Logf("Parsed: %s", got.LastModifiedBy)
+	if got.Creator != want {
+		t.Errorf("got %q, wanted %q", got.Creator, want)
+	}
+}
+
+func TestOfficePPTXMetadata(t *testing.T) {
+	got, err := OfficeFileMetadata(filepath.Join("test_file", "test_file_pptx.pptx"))
+	if err != nil {
+		t.Fatalf("error loading file \n %s", err)
+	}
+	want := "Presentazione standard di PowerPoint"
+	t.Logf("Parsed: %s", got.Title)
+	t.Logf("Parsed: %s", got.Created)
+	t.Logf("Parsed: %s", got.LastModifiedBy)
+	if got.Title != want {
+		t.Errorf("got %q, wanted %q", got.Title, want)
+	}
+}
+
 func TestPDFMetdata(t *testing.T) {
 	got, err := PDFFileMetadata(filepath.Join("test_file", "test_file_pdf.pdf"))
 	if err != nil {