Skip to content

Commit

Permalink
Dependency ingester (#5058)
Browse files Browse the repository at this point in the history
* Add repo dependency ingester

* Plug components, fix panic

This commit connects the components defined in the previous commits
and fixes a panic when reading the configuration.

Signed-off-by: Adolfo García Veytia (Puerco) <puerco@stacklok.com>

* Add initial deps ingester tests

Signed-off-by: Adolfo García Veytia (Puerco) <puerco@stacklok.com>

* make gen

Signed-off-by: Adolfo García Veytia (Puerco) <puerco@stacklok.com>

---------

Signed-off-by: Adolfo García Veytia (Puerco) <puerco@stacklok.com>
Co-authored-by: Evan Anderson <evan@stacklok.com>
  • Loading branch information
puerco and evankanderson authored Nov 27, 2024
1 parent 27464ee commit f6afcd4
Show file tree
Hide file tree
Showing 9 changed files with 3,116 additions and 2,517 deletions.
31 changes: 28 additions & 3 deletions docs/docs/ref/proto.md

Some generated files are not rendered by default. Learn more about how customized files appear on GitHub.

1 change: 1 addition & 0 deletions go.mod
Original file line number Diff line number Diff line change
Expand Up @@ -57,6 +57,7 @@ require (
github.com/openfga/openfga v1.8.0
github.com/pkg/browser v0.0.0-20240102092130-5ac0b6a4141c
github.com/prometheus/client_golang v1.20.5
github.com/protobom/protobom v0.5.0
github.com/puzpuzpuz/xsync/v3 v3.4.0
github.com/robfig/cron/v3 v3.0.1
github.com/rs/zerolog v1.33.0
Expand Down
2 changes: 2 additions & 0 deletions go.sum
Original file line number Diff line number Diff line change
Expand Up @@ -913,6 +913,8 @@ github.com/prometheus/procfs v0.0.0-20181005140218-185b4288413d/go.mod h1:c3At6R
github.com/prometheus/procfs v0.0.2/go.mod h1:TjEm7ze935MbeOT/UhFTIMYKhuLP4wbCsTZCD3I8kEA=
github.com/prometheus/procfs v0.15.1 h1:YagwOFzUgYfKKHX6Dr+sHT7km/hxC76UB0learggepc=
github.com/prometheus/procfs v0.15.1/go.mod h1:fB45yRUv8NstnjriLhBQLuOUt+WW4BsoGhij/e3PBqk=
github.com/protobom/protobom v0.5.0 h1:jJYqGpdHq99zwh0/n1SOPl1aickCBZdA8pHS9V/f+XQ=
github.com/protobom/protobom v0.5.0/go.mod h1:HL47tggz7SXYXgNm3WjQQrWB6iOirYnrATsXAEyTUkI=
github.com/puzpuzpuz/xsync v1.5.2 h1:yRAP4wqSOZG+/4pxJ08fPTwrfL0IzE/LKQ/cw509qGY=
github.com/puzpuzpuz/xsync v1.5.2/go.mod h1:K98BYhX3k1dQ2M63t1YNVDanbwUPmBCAhNmVrrxfiGg=
github.com/puzpuzpuz/xsync/v3 v3.4.0 h1:DuVBAdXuGFHv8adVXjWWZ63pJq+NRXOWVXlKDBZ+mJ4=
Expand Down
225 changes: 225 additions & 0 deletions internal/engine/ingester/deps/deps.go
Original file line number Diff line number Diff line change
@@ -0,0 +1,225 @@
// SPDX-FileCopyrightText: Copyright 2024 The Minder Authors
// SPDX-License-Identifier: Apache-2.0

// Package deps provides the deps rule data ingest engine
package deps

import (
"context"
"errors"
"fmt"

"github.com/go-git/go-billy/v5"
"github.com/go-git/go-billy/v5/helper/iofs"
"github.com/go-viper/mapstructure/v2"
scalibr "github.com/google/osv-scalibr"
"github.com/google/osv-scalibr/extractor/filesystem/list"
scalibr_fs "github.com/google/osv-scalibr/fs"
scalibr_plugin "github.com/google/osv-scalibr/plugin"
"github.com/google/uuid"
"github.com/protobom/protobom/pkg/sbom"
"github.com/rs/zerolog"
"google.golang.org/protobuf/reflect/protoreflect"

engerrors "github.com/mindersec/minder/internal/engine/errors"
pb "github.com/mindersec/minder/pkg/api/protobuf/go/minder/v1"
"github.com/mindersec/minder/pkg/engine/v1/interfaces"
"github.com/mindersec/minder/pkg/entities/v1/checkpoints"
provifv1 "github.com/mindersec/minder/pkg/providers/v1"
)

const (
// DepsRuleDataIngestType is the type of the deps rule data ingest engine
DepsRuleDataIngestType = "deps"
// defaultBranch is the branch name used when neither the ingest
// parameters, the rule-type configuration nor the repository itself
// provide one.
defaultBranch = "main"
)

// Deps is the engine for a rule type that uses deps data ingest
type Deps struct {
// cfg is the rule-type level configuration. NewDepsIngester replaces a
// nil value with an empty message, so it is non-nil on ingesters built
// through the constructor.
cfg *pb.DepsType
// gitprov is the git provider used to clone the repository to scan.
gitprov provifv1.Git
}

// Config is the set of parameters to the deps rule data ingest engine
type Config struct {
// Branch is the branch to clone and scan. It is decoded from the
// "branch" key of the ingest params.
Branch string `json:"branch" yaml:"branch" mapstructure:"branch"`
}

// NewDepsIngester builds a deps rule data ingest engine backed by the given
// git provider.
//
// A nil gitprov is rejected with an error. A nil cfg is tolerated and is
// replaced by an empty configuration message so later reads never see a nil
// config.
func NewDepsIngester(cfg *pb.DepsType, gitprov provifv1.Git) (*Deps, error) {
	if gitprov == nil {
		return nil, fmt.Errorf("provider is nil")
	}

	ingester := &Deps{cfg: cfg, gitprov: gitprov}
	if ingester.cfg == nil {
		// Fall back to an empty configuration rather than carrying nil around.
		ingester.cfg = &pb.DepsType{}
	}

	return ingester, nil
}

// GetType returns the type of the deps rule data ingest engine
func (*Deps) GetType() string {
return DepsRuleDataIngestType
}

// GetConfig returns the config for the deps rule data ingest engine
func (gi *Deps) GetConfig() protoreflect.ProtoMessage {
return gi.cfg
}

// Ingest performs the data ingestion for a rule type: the entity's git
// repository is cloned and its contents are scanned for dependencies with
// scalibr.
//
// Only *pb.Repository entities are supported; any other entity type yields
// an error.
func (gi *Deps) Ingest(ctx context.Context, ent protoreflect.ProtoMessage, params map[string]any) (*interfaces.Result, error) {
	repo, isRepo := ent.(*pb.Repository)
	if !isRepo {
		return nil, fmt.Errorf("deps is only supported for repositories")
	}

	return gi.ingestRepository(ctx, repo, params)
}
// ingestRepository clones repo at the resolved branch, scans the resulting
// worktree for dependencies and returns them under the "node_list" key of the
// result object, together with a checkpoint recording the branch and the HEAD
// commit hash.
//
// params is decoded into a Config. A missing branch on the provider side is
// reported as an evaluation failure, an empty repository as a skipped
// evaluation; any other clone error is returned as-is.
func (gi *Deps) ingestRepository(ctx context.Context, repo *pb.Repository, params map[string]any) (*interfaces.Result, error) {
	zlog := zerolog.Ctx(ctx)

	// NOTE(review): Branch starts out as defaultBranch, so getBranch gets a
	// non-empty value unless params override it with an empty one, and its
	// rule-type and repository fallbacks then go unused. Confirm this is
	// intended.
	ingestCfg := Config{Branch: defaultBranch}
	if err := mapstructure.Decode(params, &ingestCfg); err != nil {
		return nil, fmt.Errorf("failed to read dependency ingester configuration from params: %w", err)
	}

	cloneURL := repo.GetCloneUrl()
	if cloneURL == "" {
		return nil, fmt.Errorf("could not get clone url")
	}

	ref := gi.getBranch(repo, ingestCfg.Branch)
	zlog.Info().Interface("repo", repo).Msgf("extracting dependencies from %s#%s", cloneURL, ref)

	// Per the original author's note, the clone lands in the memfs go-billy
	// filesystem driver, so the ingester can run sandboxed without access to
	// the underlying filesystem.
	clone, err := gi.gitprov.Clone(ctx, cloneURL, ref)
	if err != nil {
		switch {
		case errors.Is(err, provifv1.ErrProviderGitBranchNotFound):
			return nil, fmt.Errorf("%w: %s: branch %s", engerrors.ErrEvaluationFailed,
				provifv1.ErrProviderGitBranchNotFound, ref)
		case errors.Is(err, provifv1.ErrRepositoryEmpty):
			return nil, fmt.Errorf("%w: %s", engerrors.ErrEvaluationSkipped, provifv1.ErrRepositoryEmpty)
		default:
			return nil, err
		}
	}

	worktree, err := clone.Worktree()
	if err != nil {
		return nil, fmt.Errorf("could not get worktree: %w", err)
	}

	nodes, err := scanFs(ctx, worktree.Filesystem)
	if err != nil {
		return nil, fmt.Errorf("could not scan filesystem: %w", err)
	}

	zlog.Debug().Interface("deps", nodes).Msgf("Scanning successful: %d nodes found", len(nodes.Nodes))

	headRef, err := clone.Head()
	if err != nil {
		return nil, fmt.Errorf("could not get head: %w", err)
	}

	commit := headRef.Hash()
	checkpoint := checkpoints.NewCheckpointV1Now().
		WithBranch(ref).
		WithCommitHash(commit.String())

	result := &interfaces.Result{
		Object:     map[string]any{"node_list": nodes},
		Checkpoint: checkpoint,
	}
	return result, nil
}

// getBranch resolves the branch to clone. The first non-empty value wins, in
// this order:
//
//  1. branch, as supplied by the caller (user parameters),
//  2. the branch set in the rule-type configuration,
//  3. the repository's own default branch,
//  4. the package-level defaultBranch.
func (gi *Deps) getBranch(repo *pb.Repository, branch string) string {
	// If the user has specified a branch, use that.
	if branch != "" {
		return branch
	}

	// If the branch is provided in the rule-type configuration, use that.
	// The generated getters are used on purpose: the repo section may be
	// absent (nil), and reading the field directly would panic.
	if cfgBranch := gi.cfg.GetRepo().GetBranch(); cfgBranch != "" {
		return cfgBranch
	}

	// Otherwise prefer the default branch reported by the repository.
	if repoBranch := repo.GetDefaultBranch(); repoBranch != "" {
		return repoBranch
	}

	// Nothing was configured anywhere: fall back to the built-in default.
	return defaultBranch
}

// scanFs runs the scalibr filesystem extractors over memFS and converts the
// resulting inventory into a protobom node list.
//
// Each inventory entry becomes a package node with a freshly generated UUID
// as its ID, its name and version, a PURL identifier when the extractor can
// produce one, and one "sourceFile" property per location where it was found.
//
// An error is returned when memFS is nil, when the filesystem cannot be
// adapted to the interface scalibr requires, or when the scan returns no
// status or a status other than succeeded.
func scanFs(ctx context.Context, memFS billy.Filesystem) (*sbom.NodeList, error) {
	if memFS == nil {
		return nil, fmt.Errorf("unable to scan dependencies, no active filesystem defined")
	}
	// have to down-cast here, because scalibr needs multiple io/fs types
	wrapped, ok := iofs.New(memFS).(scalibr_fs.FS)
	if !ok {
		return nil, fmt.Errorf("error converting filesystem to ReadDirFS")
	}

	// Capabilities requested from the extractors: Linux, network allowed, no
	// direct filesystem access and no running system.
	desiredCaps := scalibr_plugin.Capabilities{
		OS:            scalibr_plugin.OSLinux,
		Network:       true,
		DirectFS:      false,
		RunningSystem: false,
	}

	scalibrFs := scalibr_fs.ScanRoot{FS: wrapped}
	scanConfig := scalibr.ScanConfig{
		ScanRoots: []*scalibr_fs.ScanRoot{&scalibrFs},
		// All includes Ruby, Dotnet which we're not ready to test yet, so use the more limited Default set.
		FilesystemExtractors: list.FilterByCapabilities(list.Default, &desiredCaps),
		Capabilities:         &desiredCaps,
	}

	scanner := scalibr.New()
	scanResults := scanner.Scan(ctx, &scanConfig)

	if scanResults == nil || scanResults.Status == nil {
		return nil, fmt.Errorf("error scanning files: no results")
	}
	if scanResults.Status.Status != scalibr_plugin.ScanStatusSucceeded {
		return nil, fmt.Errorf("error scanning files: %s", scanResults.Status)
	}

	res := sbom.NewNodeList()
	for _, inv := range scanResults.Inventories {
		identifiers := map[int32]string{}
		// An extractor may be unable to build a PURL for an entry; skip the
		// identifier in that case instead of dereferencing a nil pointer.
		if purl := inv.Extractor.ToPURL(inv); purl != nil {
			identifiers[int32(sbom.SoftwareIdentifierType_PURL)] = purl.String()
		}
		// TODO: scalibr returns a _list_ of CPEs, but protobom will store one.
		// use the first?
		// identifiers[int32(sbom.SoftwareIdentifierType_CPE23)] = inv.Extractor.ToCPEs(inv)

		node := &sbom.Node{
			Type:        sbom.Node_PACKAGE,
			Id:          uuid.New().String(),
			Name:        inv.Name,
			Version:     inv.Version,
			Identifiers: identifiers,
		}
		// Record every file the package was discovered in.
		for _, l := range inv.Locations {
			node.Properties = append(node.Properties, &sbom.Property{
				Name: "sourceFile",
				Data: l,
			})
		}
		res.AddNode(node)
	}

	return res, nil
}
Loading

0 comments on commit f6afcd4

Please sign in to comment.