From 9fed83e9ef908162679275f47b4607d7db9725b9 Mon Sep 17 00:00:00 2001 From: Guillaume Fieni Date: Tue, 21 May 2019 09:11:51 +0200 Subject: [PATCH 01/18] api: Rework client to use new g5k api version (4.0) --- api/api.go | 28 ++++++++++++++++++++++++---- 1 file changed, 24 insertions(+), 4 deletions(-) diff --git a/api/api.go b/api/api.go index e19eafe..3ef96d7 100644 --- a/api/api.go +++ b/api/api.go @@ -1,12 +1,15 @@ package api import ( + "net/url" + gopath "path" + "github.com/go-resty/resty" ) const ( - // G5kAPIFrontend is the link to the Grid'5000 API frontend - G5kAPIFrontend = "https://api.grid5000.fr/stable" + g5kAPIhostname string = "api.grid5000.fr" + g5kAPIversion string = "4.0" ) // Client is a client to the Grid'5000 REST API @@ -25,9 +28,26 @@ func NewClient(username, password, site string) *Client { } } -// Request returns a configured resty request -func (c *Client) Request() *resty.Request { +// getRequest returns a configured resty request +func (c *Client) getRequest() *resty.Request { return resty.R(). SetHeader("Accept", "application/json"). 
SetBasicAuth(c.Username, c.Password) } + +// getBaseURL returns the Grid'5000 API base url +func (c *Client) getBaseURL() *url.URL { + return &url.URL{ + Scheme: "https", + Host: g5kAPIhostname, + Path: gopath.Join(g5kAPIversion, "sites", c.Site), + } +} + +// getEndpoint construct and returns the API endpoint for the given api name and path +func (c *Client) getEndpoint(api string, path string, params url.Values) string { + url := c.getBaseURL() + url.Path = gopath.Join(url.Path, api, path) + url.RawQuery = params.Encode() + return url.String() +} From 6ae18e5fb99f9d80e0a55d791a00a09afa2b1717 Mon Sep 17 00:00:00 2001 From: Guillaume Fieni Date: Wed, 22 May 2019 14:32:11 +0200 Subject: [PATCH 02/18] api: Update Job module to use the new g5k api version --- api/jobs.go | 57 ++++++++++++++--------------------------------------- 1 file changed, 15 insertions(+), 42 deletions(-) diff --git a/api/jobs.go b/api/jobs.go index b72bc2e..52464ed 100644 --- a/api/jobs.go +++ b/api/jobs.go @@ -2,9 +2,7 @@ package api import ( "fmt" - "sort" - - "github.com/docker/machine/libmachine/log" + "net/url" ) // JobRequest represents a new job submission @@ -29,92 +27,67 @@ type Job struct { // SubmitJob submit a new job on g5k api and return the job id func (c *Client) SubmitJob(jobReq JobRequest) (int, error) { - // create url for API call - url := fmt.Sprintf("%s/sites/%s/jobs", G5kAPIFrontend, c.Site) - - log.Info("Submitting a new job...") - // send job request - jobRes, err := c.Request(). + req, err := c.getRequest(). SetHeader("Content-Type", "application/json"). SetBody(jobReq). SetResult(&Job{}). 
- Post(url) + Post(c.getEndpoint("jobs", "/", url.Values{})) if err != nil { return 0, fmt.Errorf("Error while sending Job submission: '%s'", err) } // check HTTP error code (expected: 201 Created) - if jobRes.StatusCode() != 201 { - return 0, fmt.Errorf("The server returned an error (code: %d) after sending Job submission: '%s'", jobRes.StatusCode(), jobRes.Status()) + if req.StatusCode() != 201 { + return 0, fmt.Errorf("The server returned an error (code: %d) after sending Job submission: '%s'", req.StatusCode(), req.Status()) } // unmarshal result - job, ok := jobRes.Result().(*Job) + job, ok := req.Result().(*Job) if !ok { return 0, fmt.Errorf("Error in the response of the Job submission (unexpected type)") } - log.Infof("Job submitted successfully (id: '%v')", job.UID) return job.UID, nil } // GetJob get the job from its id func (c *Client) GetJob(jobID int) (*Job, error) { - // create url for API call - urlJob := fmt.Sprintf("%s/sites/%s/jobs/%v", G5kAPIFrontend, c.Site, jobID) - // send request - jobRes, err := c.Request(). + req, err := c.getRequest(). SetResult(&Job{}). 
- Get(urlJob) + Get(c.getEndpoint("jobs", fmt.Sprintf("/%v", jobID), url.Values{})) if err != nil { return nil, fmt.Errorf("Error while retrieving Job informations") } // check HTTP error code (expected: 200 OK) - if jobRes.StatusCode() != 200 { - return nil, fmt.Errorf("The server returned an error (code: %d) after requesting Job informations: '%s'", jobRes.StatusCode(), jobRes.Status()) + if req.StatusCode() != 200 { + return nil, fmt.Errorf("The server returned an error (code: %d) after requesting Job informations: '%s'", req.StatusCode(), req.Status()) } // unmarshal result - job, ok := jobRes.Result().(*Job) + job, ok := req.Result().(*Job) if !ok { return nil, fmt.Errorf("Error in the Job retrieving (unexpected type)") } - sort.Strings(job.Types) - sort.Strings(job.Nodes) return job, nil } -// GetJobState returns the current state of the job -func (c *Client) GetJobState(jobID int) (string, error) { - // get job from api - job, err := c.GetJob(jobID) - if err != nil { - return "", err - } - - return job.State, nil -} - // KillJob ask for deletion of a job func (c *Client) KillJob(jobID int) error { - // create url for API call - url := fmt.Sprintf("%s/sites/%s/jobs/%v", G5kAPIFrontend, c.Site, jobID) - // send delete request - delRes, err := c.Request().Delete(url) + req, err := c.getRequest().Delete(c.getEndpoint("jobs", fmt.Sprintf("/%v", jobID), url.Values{})) if err != nil { return fmt.Errorf("Error while killing job: '%s'", err) } - // check HTTP error code (expected: 202 Accepted) - if delRes.StatusCode() != 202 { - return fmt.Errorf("The server returned an error (code: %d) after job killing request: '%s'", delRes.StatusCode(), delRes.Status()) + // check HTTP error code (202 when accepted or 400 in case the job have already been killed) + if req.StatusCode() != 202 && req.StatusCode() != 400 { + return fmt.Errorf("The server returned an error (code: %d) after job killing request: '%s'", req.StatusCode(), req.Status()) } return nil From 
8e0fab71759d0350436954cb2929bb447cef19e9 Mon Sep 17 00:00:00 2001 From: Guillaume Fieni Date: Wed, 22 May 2019 15:12:54 +0200 Subject: [PATCH 03/18] api: Add kadeploy module to handle deployments, power and reboot operations --- api/deployment.go | 85 ------------------ api/kadeploy.go | 216 ++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 216 insertions(+), 85 deletions(-) delete mode 100644 api/deployment.go create mode 100644 api/kadeploy.go diff --git a/api/deployment.go b/api/deployment.go deleted file mode 100644 index 8718e89..0000000 --- a/api/deployment.go +++ /dev/null @@ -1,85 +0,0 @@ -package api - -import ( - "fmt" - "sort" - - "github.com/docker/machine/libmachine/log" -) - -// DeploymentRequest represents a new deployment request -type DeploymentRequest struct { - Nodes []string `json:"nodes"` - Environment string `json:"environment"` - Key string `json:"key"` -} - -// Deployment represents a deployment response -type Deployment struct { - Nodes []string `json:"nodes"` - Site string `json:"site_uid"` - Status string `json:"status"` - UID string `json:"uid"` -} - -// SubmitDeployment submits a new deployment request to g5k api -func (c *Client) SubmitDeployment(deploymentReq DeploymentRequest) (string, error) { - // create url for API call - url := fmt.Sprintf("%s/sites/%s/deployments", G5kAPIFrontend, c.Site) - - log.Infof("Submitting a new deployment... (image: '%s')", deploymentReq.Environment) - - // send deployment request - deploymentRes, err := c.Request(). - SetHeader("Content-Type", "application/json"). - SetBody(deploymentReq). - SetResult(&Deployment{}). 
- Post(url) - - if err != nil { - return "", fmt.Errorf("Error while sending the deployment request: '%s'", err) - } - - // check HTTP error code (expected: 201 Created) - if deploymentRes.StatusCode() != 201 { - return "", fmt.Errorf("The server returned an error (code: %d) after sending Deployment request: '%s'", deploymentRes.StatusCode(), deploymentRes.Status()) - } - - // unmarshal result - deployment, ok := deploymentRes.Result().(*Deployment) - if !ok { - return "", fmt.Errorf("Error in the response of the Deployment request (unexpected type)") - } - - log.Infof("Deployment submitted successfully (id: '%s')", deployment.UID) - return deployment.UID, nil -} - -// GetDeployment get the deployment from its id -func (c *Client) GetDeployment(deploymentID string) (*Deployment, error) { - // create url for API call - url := fmt.Sprintf("%s/sites/%s/deployments/%s", G5kAPIFrontend, c.Site, deploymentID) - - // send request - deploymentRes, err := c.Request(). - SetResult(&Deployment{}). 
- Get(url) - - if err != nil { - return nil, fmt.Errorf("Error while retrieving Deployment informations: '%s'", err) - } - - // check HTTP error code (expected: 200 OK) - if deploymentRes.StatusCode() != 200 { - return nil, fmt.Errorf("The server returned an error (code: %d) after requesting Deployment informations: '%s'", deploymentRes.StatusCode(), deploymentRes.Status()) - } - - // unmarshal result - deployment, ok := deploymentRes.Result().(*Deployment) - if !ok { - return nil, fmt.Errorf("Error in the Deployment retrieving (unexpected type)") - } - - sort.Strings(deployment.Nodes) - return deployment, nil -} diff --git a/api/kadeploy.go b/api/kadeploy.go new file mode 100644 index 0000000..3a65f5d --- /dev/null +++ b/api/kadeploy.go @@ -0,0 +1,216 @@ +package api + +import ( + "fmt" + "net/url" +) + +// PowerOperation stores the attributes for a Power operation +type PowerOperation struct { + Nodes []string `json:"nodes"` + Status string `json:"status"` + Level string `json:"level,omitempty"` +} + +// RebootOperation stores the attributes for a Reboot operation +type RebootOperation struct { + Kind string `json:"kind"` + Nodes []string `json:"nodes"` + Level string `json:"level,omitempty"` +} + +// DeploymentOperationEnvironment stores the attributes about the environment to be deployed by a Deployment operation +type DeploymentOperationEnvironment struct { + Kind string `json:"kind"` + User string `json:"user,omitempty"` + Name string `json:"name"` + Version string `json:"version,omitempty"` +} + +// DeploymentOperationCustomOperation stores the attributes for a custom deployment operation of a Deployment operation +type DeploymentOperationCustomOperation struct { + Action string `json:"action"` + Name string `json:"name"` + Command string `json:"command,omitempty"` +} + +// DeploymentOperation stores the attributes for a Deployment operation +type DeploymentOperation struct { + Nodes []string `json:"nodes"` + Environment DeploymentOperationEnvironment 
`json:"environment"` + CustomOperations map[string]map[string]map[string][]DeploymentOperationCustomOperation `json:"custom_operations,omitempty"` +} + +// OperationResponse stores the attributes of the response of the submission of an operation +type OperationResponse struct { + WID string `json:"wid"` +} + +// OperationWorkflow stores the attributes of the Workflow of an operation +type OperationWorkflow struct { + WID string `json:"wid"` + Done bool `json:"done"` + Error bool `json:"error"` + Nodes map[string][]string `json:"nodes"` // possible keys: ok, ko, processing +} + +// OperationStates stores the State attributes of each nodes concerned by a workflow +type OperationStates map[string]struct { + Macro string `json:"macro"` + Micro string `json:"micro"` + State string `json:"state"` + Out string `json:"out,omitempty"` +} + +// SubmitPowerOperation submit a power operation to the Kadeploy3 API +func (c *Client) SubmitPowerOperation(operation PowerOperation) (*OperationResponse, error) { + // send power operation to kadeploy3 API + req, err := c.getRequest(). + SetBody(operation). + SetResult(&OperationResponse{}). + Put(c.getEndpoint("internal/kadeployapi", "/power", url.Values{})) + + if err != nil { + return nil, fmt.Errorf("Error while sending the power operation: '%s'", err) + } + + // check HTTP error code (expected: 200 OK) + if req.StatusCode() != 200 { + return nil, fmt.Errorf("The server returned an error (code: %d) after sending the power operation: '%s'", req.StatusCode(), req.Status()) + } + + // unmarshal result + res, ok := req.Result().(*OperationResponse) + if !ok { + return nil, fmt.Errorf("Error in the response of the reboot submission (unexpected type)") + } + + return res, nil +} + +// RequestPowerStatus request the power status of the node to the Kadeploy3 API +func (c *Client) RequestPowerStatus(node string) (*OperationResponse, error) { + // send power operation to kadeploy3 API + req, err := c.getRequest(). 
+ SetResult(&OperationResponse{}). + Get(c.getEndpoint("internal/kadeployapi", "/power", url.Values{"nodes": []string{node}})) + + if err != nil { + return nil, err + } + + // check HTTP error code (expected: 200 OK) + if req.StatusCode() != 200 { + return nil, fmt.Errorf("The server returned an error (code: %d) after sending the power operation: '%s'", req.StatusCode(), req.Status()) + } + + // unmarshal result + res, ok := req.Result().(*OperationResponse) + if !ok { + return nil, fmt.Errorf("Error in the response of the reboot submission (unexpected type)") + } + + return res, nil +} + +// SubmitRebootOperation submit a reboot operation to the Kadeploy3 API +func (c *Client) SubmitRebootOperation(operation RebootOperation) (*OperationResponse, error) { + // send reboot operation to kadeploy3 API + req, err := c.getRequest(). + SetBody(operation). + SetResult(&OperationResponse{}). + Post(c.getEndpoint("internal/kadeployapi", "/reboot", url.Values{})) + + if err != nil { + return nil, fmt.Errorf("Error while sending the reboot operation: '%s'", err) + } + + // check HTTP error code (expected: 200 OK) + if req.StatusCode() != 200 { + return nil, fmt.Errorf("The server returned an error (code: %d) after sending the reboot operation: '%s'", req.StatusCode(), req.Status()) + } + + // unmarshal result + res, ok := req.Result().(*OperationResponse) + if !ok { + return nil, fmt.Errorf("Error in the response of the reboot submission (unexpected type)") + } + + return res, nil +} + +// SubmitDeployment submits a new deployment request to g5k api +func (c *Client) SubmitDeployment(operation DeploymentOperation) (*OperationResponse, error) { + // send deployment request to kadeploy3 API + req, err := c.getRequest(). + SetBody(operation). + SetResult(&OperationResponse{}). 
+ Post(c.getEndpoint("deployment", "/", url.Values{})) + + if err != nil { + return nil, fmt.Errorf("Error while sending the deployment request: '%s'", err) + } + + // check HTTP error code (expected: 200 OK) + if req.StatusCode() != 200 { + return nil, fmt.Errorf("The server returned an error (code: %d) after sending Deployment request: '%s'", req.StatusCode(), req.Status()) + } + + // unmarshal result + res, ok := req.Result().(*OperationResponse) + if !ok { + return nil, fmt.Errorf("Error in the response of the Deployment request (unexpected type)") + } + + return res, nil +} + +// GetOperationWorkflow fetch and return an operation workflow from its ID +func (c *Client) GetOperationWorkflow(operation string, wid string) (*OperationWorkflow, error) { + // get workflow fron kadeploy3 API + req, err := c.getRequest(). + SetResult(&OperationWorkflow{}). + Get(c.getEndpoint("internal/kadeployapi", fmt.Sprintf("/%s/%s", operation, wid), url.Values{})) + + if err != nil { + return nil, err + } + + // check HTTP error code (expected: 200 OK) + if req.StatusCode() != 200 { + return nil, fmt.Errorf("The server returned an error (code: %d) while fetching the operation workflow: '%s'", req.StatusCode(), req.Status()) + } + + // unmarshal result + workflow, ok := req.Result().(*OperationWorkflow) + if !ok { + return nil, fmt.Errorf("Error in the response of the operation workflow (unexpected type)") + } + + return workflow, nil +} + +// GetOperationStates fetch and return the states of an operation workflow from its ID +func (c *Client) GetOperationStates(operation string, wid string) (*OperationStates, error) { + // get workflow fron kadeploy3 API + req, err := c.getRequest(). + SetResult(&OperationStates{}). 
+ Get(c.getEndpoint("internal/kadeployapi", fmt.Sprintf("/%s/%s/state", operation, wid), url.Values{})) + + if err != nil { + return nil, err + } + + // check HTTP error code (expected: 200 OK) + if req.StatusCode() != 200 { + return nil, fmt.Errorf("The server returned an error (code: %d) while fetching the operation states: '%s'", req.StatusCode(), req.Status()) + } + + // unmarshal result + states, ok := req.Result().(*OperationStates) + if !ok { + return nil, fmt.Errorf("Error in the response of the operation states (unexpected type)") + } + + return states, nil +} From 35a5c2eb4c6729a7fa044008313c6eb6d327e557 Mon Sep 17 00:00:00 2001 From: Guillaume Fieni Date: Fri, 24 May 2019 17:24:10 +0200 Subject: [PATCH 04/18] driver: Add helper functions for node power status operations --- driver/g5k.go | 80 +++++++++++++++++++++++++++++++++++++++++++++++++++ 1 file changed, 80 insertions(+) diff --git a/driver/g5k.go b/driver/g5k.go index a99b496..00d8492 100644 --- a/driver/g5k.go +++ b/driver/g5k.go @@ -252,3 +252,83 @@ func (d *Driver) deployImageToNode() error { return nil } + +// getNodePowerState returns the power status of the node by querying its baseboard management controller (BMC) +func (d *Driver) getNodePowerState() (string, error) { + node, err := d.GetIP() + if err != nil { + return "", fmt.Errorf("Failed to get the node hostname: %s", err.Error()) + } + + op, err := d.G5kAPI.RequestPowerStatus(node) + if err != nil { + return "", fmt.Errorf("Failed to request power status: %s", err.Error()) + } + + if err := d.waitUntilWorkflowIsDone("power", op.WID, node); err != nil { + return "", err + } + + // get nodes states for the workflow + states, err := d.G5kAPI.GetOperationStates("power", op.WID) + if err != nil { + return "", err + } + + // get the state of the current node + state, ok := (*states)[node] + if !ok { + return "", fmt.Errorf("Failed to retrieve the workflow state of the power status operation") + } + + // extract the BMC power status from the 
state out attribute + re := regexp.MustCompile(`-bmc: (on|off)$`) + matches := re.FindStringSubmatch(state.Out) + if matches == nil { + return "", fmt.Errorf("The BMC status in the workflow state is invalid: %s", state.Out) + } + + return matches[1], nil +} + +// changeNodePowerStatus change the power status (on/off) of the node with the given level (soft/hard) +func (d *Driver) changeNodePowerStatus(status string, level string) error { + node, err := d.GetIP() + if err != nil { + return fmt.Errorf("Failed to get the node hostname: %s", err.Error()) + } + + op, err := d.G5kAPI.SubmitPowerOperation(api.PowerOperation{ + Nodes: []string{node}, + Status: status, + Level: level, + }) + + if err != nil { + return err + } + + log.Infof("Power-%s (%s) operation for '%s' node have been submitted successfully (workflow id: '%s')", status, level, node, op.WID) + return d.waitUntilWorkflowIsDone("power", op.WID, node) +} + +// rebootNode reboot the node with the given level (soft/hard) +func (d *Driver) rebootNode(level string) error { + node, err := d.GetIP() + if err != nil { + return fmt.Errorf("Failed to get the node hostname: %s", err.Error()) + } + + op, err := d.G5kAPI.SubmitRebootOperation(api.RebootOperation{ + Kind: "simple", + Nodes: []string{node}, + Level: level, + }) + + if err != nil { + return err + } + + log.Infof("Reboot (%s) operation for '%s' node have been submitted successfully (workflow id: '%s')", level, node, op.WID) + return d.waitUntilWorkflowIsDone("reboot", op.WID, node) +} From 9c51d47c793b508be799a1995c97b4805fa5379d Mon Sep 17 00:00:00 2001 From: Guillaume Fieni Date: Mon, 27 May 2019 09:47:19 +0200 Subject: [PATCH 05/18] driver: Move driver store functions to their own file --- driver/g5k.go | 57 +------------------------------------------------ driver/store.go | 57 +++++++++++++++++++++++++++++++++++++++++++++++++ 2 files changed, 58 insertions(+), 56 deletions(-) create mode 100644 driver/store.go diff --git a/driver/g5k.go b/driver/g5k.go 
index 00d8492..de77032 100644 --- a/driver/g5k.go +++ b/driver/g5k.go @@ -2,19 +2,11 @@ package driver import ( "fmt" - "io/ioutil" - "net" - "os" - "path/filepath" - "sort" - "strings" + "regexp" "time" - gossh "golang.org/x/crypto/ssh" - "github.com/Spirals-Team/docker-machine-driver-g5k/api" "github.com/docker/machine/libmachine/log" - "github.com/docker/machine/libmachine/ssh" ) // checkVpnConnection check if the VPN is connected and properly configured (DNS) by trying to connect to the site frontend SSH server using its hostname @@ -34,52 +26,6 @@ func (d *Driver) checkVpnConnection() error { return nil } -// resolveDriverStorePath returns the store path of the driver -func (d *Driver) resolveDriverStorePath(file string) string { - return filepath.Join(d.StorePath, "g5k", file) -} - -// prepareDriverStoreDirectory initialize the driver storage directory -func (d *Driver) prepareDriverStoreDirectory() error { - driverStoreBasePath := d.resolveDriverStorePath(".") - - // create the directory if needed - if _, err := os.Stat(driverStoreBasePath); os.IsNotExist(err) { - if err := os.Mkdir(driverStoreBasePath, 0700); err != nil { - return fmt.Errorf("Failed to create the driver storage directory: %s", err) - } - } - - return nil -} - -// getDriverSSHKeyPath returns the path leading to the driver SSH private key (append .pub to get the public key) -func (d *Driver) getDriverSSHKeyPath() string { - return d.resolveDriverStorePath("id_rsa") -} - -// loadDriverSSHPublicKey load the driver SSH Public key from the storage dir, the key will be created if needed -func (d *Driver) loadDriverSSHPublicKey() error { - driverSSHKeyPath := d.getDriverSSHKeyPath() - - // generate the driver SSH key pair if needed - if _, err := os.Stat(driverSSHKeyPath); os.IsNotExist(err) { - if err := ssh.GenerateSSHKey(driverSSHKeyPath); err != nil { - return fmt.Errorf("Failed to generate the driver ssh key: %s", err) - } - } - - // load the public key from file - sshPublicKey, err := 
ioutil.ReadFile(d.getDriverSSHKeyPath() + ".pub") - if err != nil { - return fmt.Errorf("Failed to load the driver ssh public key: %s", err) - } - - // store the public key for future use - d.DriverSSHPublicKey = strings.TrimSpace(string(sshPublicKey)) - return nil -} - // generateSSHAuthorizedKeys generate the SSH AuthorizedKeys composed of the driver and user defined key(s) func (d *Driver) generateSSHAuthorizedKeys() string { var authorizedKeysEntries []string @@ -96,7 +42,6 @@ func (d *Driver) generateSSHAuthorizedKeys() string { return strings.Join(authorizedKeysEntries, "\n") + "\n" } - // waitUntilJobIsReady wait until the job reach the 'running' state (no timeout) func (d *Driver) waitUntilJobIsReady() error { log.Info("Waiting for job to run...") diff --git a/driver/store.go b/driver/store.go new file mode 100644 index 0000000..099e67f --- /dev/null +++ b/driver/store.go @@ -0,0 +1,57 @@ +package driver + +import ( + "fmt" + "io/ioutil" + "os" + "path/filepath" + "strings" + + "github.com/docker/machine/libmachine/ssh" +) + +// resolveDriverStorePath returns the store path of the driver +func (d *Driver) resolveDriverStorePath(file string) string { + return filepath.Join(d.StorePath, "g5k", file) +} + +// prepareDriverStoreDirectory initialize the driver storage directory +func (d *Driver) prepareDriverStoreDirectory() error { + driverStoreBasePath := d.resolveDriverStorePath(".") + + // create the directory if needed + if _, err := os.Stat(driverStoreBasePath); os.IsNotExist(err) { + if err := os.Mkdir(driverStoreBasePath, 0700); err != nil { + return fmt.Errorf("Failed to create the driver storage directory: %s", err) + } + } + + return nil +} + +// getDriverSSHKeyPath returns the path leading to the driver SSH private key (append .pub to get the public key) +func (d *Driver) getDriverSSHKeyPath() string { + return d.resolveDriverStorePath("id_rsa") +} + +// loadDriverSSHPublicKey load the driver SSH Public key from the storage dir, the key will be 
created if needed +func (d *Driver) loadDriverSSHPublicKey() error { + driverSSHKeyPath := d.getDriverSSHKeyPath() + + // generate the driver SSH key pair if needed + if _, err := os.Stat(driverSSHKeyPath); os.IsNotExist(err) { + if err := ssh.GenerateSSHKey(driverSSHKeyPath); err != nil { + return fmt.Errorf("Failed to generate the driver ssh key: %s", err) + } + } + + // load the public key from file + sshPublicKey, err := ioutil.ReadFile(d.getDriverSSHKeyPath() + ".pub") + if err != nil { + return fmt.Errorf("Failed to load the driver ssh public key: %s", err) + } + + // store the public key for future use + d.DriverSSHPublicKey = strings.TrimSpace(string(sshPublicKey)) + return nil +} From dccf9fe861a9a8fdc11bd47a7230c0a891bc2f71 Mon Sep 17 00:00:00 2001 From: Guillaume Fieni Date: Mon, 27 May 2019 14:07:26 +0200 Subject: [PATCH 06/18] driver: Move ssh authorized keys generation to util file --- driver/g5k.go | 16 ---------------- driver/utils.go | 37 +++++++++++++++++++++++++++++++++++++ 2 files changed, 37 insertions(+), 16 deletions(-) create mode 100644 driver/utils.go diff --git a/driver/g5k.go b/driver/g5k.go index de77032..e933224 100644 --- a/driver/g5k.go +++ b/driver/g5k.go @@ -26,22 +26,6 @@ func (d *Driver) checkVpnConnection() error { return nil } -// generateSSHAuthorizedKeys generate the SSH AuthorizedKeys composed of the driver and user defined key(s) -func (d *Driver) generateSSHAuthorizedKeys() string { - var authorizedKeysEntries []string - - // add driver key - authorizedKeysEntries = append(authorizedKeysEntries, "# docker-machine driver g5k - driver key") - authorizedKeysEntries = append(authorizedKeysEntries, d.DriverSSHPublicKey) - - // add external key(s) - for index, externalPubKey := range d.ExternalSSHPublicKeys { - authorizedKeysEntries = append(authorizedKeysEntries, fmt.Sprintf("# docker-machine driver g5k - additional key %d", index)) - authorizedKeysEntries = append(authorizedKeysEntries, strings.TrimSpace(externalPubKey)) - } - 
- return strings.Join(authorizedKeysEntries, "\n") + "\n" -} // waitUntilJobIsReady wait until the job reach the 'running' state (no timeout) func (d *Driver) waitUntilJobIsReady() error { log.Info("Waiting for job to run...") diff --git a/driver/utils.go b/driver/utils.go new file mode 100644 index 0000000..e29a805 --- /dev/null +++ b/driver/utils.go @@ -0,0 +1,37 @@ +package driver + +import ( + "fmt" + "net" + "strings" + "time" + + "golang.org/x/crypto/ssh" +) + +// ArrayContainsString check if the given string array contains the given string +func ArrayContainsString(array []string, str string) bool { + for _, v := range array { + if v == str { + return true + } + } + return false +} + +// GenerateSSHAuthorizedKeys generate the SSH AuthorizedKeys composed of the driver and external user defined key(s) +func GenerateSSHAuthorizedKeys(driverKey string, externalKeys []string) string { + var authorizedKeysEntries []string + + // add driver key + authorizedKeysEntries = append(authorizedKeysEntries, "# docker-machine driver g5k - driver key") + authorizedKeysEntries = append(authorizedKeysEntries, driverKey) + + // add external key(s) + for index, externalPubKey := range externalKeys { + authorizedKeysEntries = append(authorizedKeysEntries, fmt.Sprintf("# docker-machine driver g5k - additional key %d", index)) + authorizedKeysEntries = append(authorizedKeysEntries, strings.TrimSpace(externalPubKey)) + } + + return strings.Join(authorizedKeysEntries, "\n") + "\n" +} From a30af0addd7df8a99e633e111d28d7017c489edb Mon Sep 17 00:00:00 2001 From: Guillaume Fieni Date: Tue, 28 May 2019 16:22:14 +0200 Subject: [PATCH 07/18] driver: Move ssh connection check function to util file --- driver/g5k.go | 15 ++++----------- driver/utils.go | 13 +++++++++++++ 2 files changed, 17 insertions(+), 11 deletions(-) diff --git a/driver/g5k.go b/driver/g5k.go index e933224..4e7b5df 100644 --- a/driver/g5k.go +++ b/driver/g5k.go @@ -9,17 +9,10 @@ import ( 
"github.com/docker/machine/libmachine/log" ) -// checkVpnConnection check if the VPN is connected and properly configured (DNS) by trying to connect to the site frontend SSH server using its hostname -func (d *Driver) checkVpnConnection() error { - // construct site frontend hostname - frontend := fmt.Sprintf("frontend.%s.grid5000.fr:22", d.G5kSite) - - // try to connect to the frontend SSH server - sshConfig := &gossh.ClientConfig{} - _, err := gossh.Dial("tcp", frontend, sshConfig) - - // we need to check if the error is network-related because the SSH Dial will always return an error due to the Authentication being not configured - if _, ok := err.(*net.OpError); ok { +func (d *Driver) checkVpnConfiguration() error { + // Check VPN connection by trying to connect to the ssh server of the frontend of the current site. + // This allows to test if the user use the VPN and the Grid'5000 DNS servers. + if err := CheckSSHConnection(fmt.Sprintf("frontend.%s.grid5000.fr", d.G5kSite)); err != nil { return fmt.Errorf("Connection to frontend of '%s' site failed. 
Please check if the site is not undergoing maintenance and your VPN client is connected and properly configured (see driver documentation for more information)", d.G5kSite) } diff --git a/driver/utils.go b/driver/utils.go index e29a805..0ba9d48 100644 --- a/driver/utils.go +++ b/driver/utils.go @@ -35,3 +35,16 @@ func GenerateSSHAuthorizedKeys(driverKey string, externalKeys []string) string { return strings.Join(authorizedKeysEntries, "\n") + "\n" } + +// CheckSSHConnection will try a SSH connection to the given hostname +func CheckSSHConnection(hostname string) error { + _, err := ssh.Dial("tcp", net.JoinHostPort(hostname, "22"), &ssh.ClientConfig{Timeout: time.Second * 2}) + + // we need to check if the error is network-related because the SSH Dial will always return an error due to the Authentication being not configured + if _, ok := err.(*net.OpError); ok { + return fmt.Errorf("Failed to connect to the SSH server on the node '%s' using port 22", hostname) + } + + // ignore other errors because the ssh Dial will always return an error as there is no auth method configured + return nil +} From c2c7988ffc4050b936a2031aa075d5072245814c Mon Sep 17 00:00:00 2001 From: Guillaume Fieni Date: Tue, 28 May 2019 17:57:06 +0200 Subject: [PATCH 08/18] driver: Rework deployment wait function to support all workflow types --- driver/g5k.go | 34 ++++++++++++++++++++++------------ 1 file changed, 22 insertions(+), 12 deletions(-) diff --git a/driver/g5k.go b/driver/g5k.go index 4e7b5df..a7cbbbd 100644 --- a/driver/g5k.go +++ b/driver/g5k.go @@ -100,27 +100,37 @@ func (d *Driver) makeJobReservation() error { return nil } -// waitUntilDeploymentIsFinished will wait until the deployment reach the 'terminated' state (no timeout) -func (d *Driver) waitUntilDeploymentIsFinished(deploymentID string) error { - log.Info("Waiting for deployment to finish, it will take a few minutes...") +// waitUntilWorkflowIsDone will wait until the workflow for the given operation is done (successfully 
or not) for the node +func (d *Driver) waitUntilWorkflowIsDone(operation string, wid string, node string) error { + log.Infof("Waiting for workflow of '%s' operation to finish, it will take a few minutes...", operation) - // refresh deployment status - for deployment, err := d.G5kAPI.GetDeployment(deploymentID); deployment.Status != "terminated"; deployment, err = d.G5kAPI.GetDeployment(deploymentID) { - // check if GetDeployment returned an error + for { + // get operation workflow + workflow, err := d.G5kAPI.GetOperationWorkflow(operation, wid) if err != nil { return err } - // stop if the deployment is in 'canceled' or 'error' state - if deployment.Status == "canceled" || deployment.Status == "error" { - return fmt.Errorf("Can't wait for a deployment in '%s' state", deployment.Status) + // check if the workflow is done for the node + if ArrayContainsString(workflow.Nodes["ok"], node) { + break } - // wait 10 seconds before making another API call - time.Sleep(10 * time.Second) + // check if the workflow failed for the node + if ArrayContainsString(workflow.Nodes["ko"], node) { + return fmt.Errorf("Workflow for '%s' operation failed for the '%s' node", operation, node) + } + + // check if the workflow is processing the node + if ArrayContainsString(workflow.Nodes["processing"], node) { + log.Debugf("Workflow for '%s' operation is in processing state for the '%s' node", operation, node) + } + + // wait before making another API call + time.Sleep(7 * time.Second) } - log.Info("Deployment finished successfully") + log.Infof("Workflow for '%s' operation finished successfully for the '%s' node", operation, node) return nil } From d2001c18054007f3131e287347cd5a96214810b8 Mon Sep 17 00:00:00 2001 From: Guillaume Fieni Date: Wed, 29 May 2019 09:32:09 +0200 Subject: [PATCH 09/18] driver: Update node image deployment function to use kadeploy api --- driver/g5k.go | 62 ++++++++++++++++++++++++++++++++++----------------- 1 file changed, 41 insertions(+), 21 deletions(-) diff 
--git a/driver/g5k.go b/driver/g5k.go index a7cbbbd..330811c 100644 --- a/driver/g5k.go +++ b/driver/g5k.go @@ -1,6 +1,7 @@ package driver import ( + "encoding/base64" "fmt" "regexp" "time" @@ -58,8 +59,10 @@ func (d *Driver) makeJobSubmission() error { if d.G5kReuseRefEnvironment { // remove the 'deploy' job type because we will not deploy the machine jobTypes = []string{} + // convert the ssh authorized_keys to be added in base64 + sshAuthorizedKeysBase64 := base64.StdEncoding.EncodeToString([]byte(GenerateSSHAuthorizedKeys(d.DriverSSHPublicKey, d.ExternalSSHPublicKeys))) // enable sudo for current user, add public key to ssh authorized keys for root user and wait the end of the job - jobCommand = `sudo-g5k && echo -n "` + d.generateSSHAuthorizedKeys() + `" |sudo tee -a /root/.ssh/authorized_keys >/dev/null && sleep 365d` + jobCommand = fmt.Sprint(`sudo-g5k && printf ` + sshAuthorizedKeysBase64 + ` |base64 -d |sudo tee -a /root/.ssh/authorized_keys >/dev/null && sleep 365d`) } // submit new Job request @@ -74,6 +77,7 @@ func (d *Driver) makeJobSubmission() error { return fmt.Errorf("Error when submitting new job: %s", err.Error()) } + log.Infof("Job submission have been successfully submitted. (job id: %d)", jobID) d.G5kJobID = jobID return nil } @@ -96,6 +100,7 @@ func (d *Driver) makeJobReservation() error { return fmt.Errorf("Error when submitting new job: %s", err.Error()) } + log.Infof("Job reservation have been successfully submitted. 
(job id: %d)", jobID) d.G5kJobID = jobID return nil } @@ -134,18 +139,11 @@ func (d *Driver) waitUntilWorkflowIsDone(operation string, wid string, node stri return nil } -// handleDeploymentError deallocate the resources when the deployment fail -func (d *Driver) handleDeploymentError() { - // if deployment fail, we can't recover from this error, so we kill the job - log.Infof("Unrecoverable error in deployment, killing job ID '%d'...", d.G5kJobID) - d.G5kAPI.KillJob(d.G5kJobID) -} - // deployImageToNode start the deployment of an OS image to a node func (d *Driver) deployImageToNode() error { // if the user want to reuse Grid'5000 reference environment if d.G5kReuseRefEnvironment { - log.Infof("Skipping host deployment and reusing Grid'5000 standard environment") + log.Infof("Skipping image deployment and reusing Grid'5000 standard environment") return nil } @@ -156,29 +154,51 @@ func (d *Driver) deployImageToNode() error { } // check job type before deploying - if sort.SearchStrings(job.Types, "deploy") != 0 { + if !ArrayContainsString(job.Types, "deploy") { return fmt.Errorf("The job (id: %d) needs to have the type 'deploy'", d.G5kJobID) } - // check if there is only one node for this reservation - if len(job.Nodes) != 1 { - return fmt.Errorf("The job (id: '%d') needs to have only one node instead of %d", d.G5kJobID, len(job.Nodes)) + // get the hostname of the node + node, err := d.GetIP() + if err != nil { + return fmt.Errorf("Failed to get the node hostname: %s", err.Error()) } - // deploy environment - deploymentID, err := d.G5kAPI.SubmitDeployment(api.DeploymentRequest{ - Nodes: job.Nodes, - Environment: d.G5kImage, - Key: d.generateSSHAuthorizedKeys(), + log.Infof("Submitting a new deployment for node '%s'... 
(image: '%s')", node, d.G5kImage) + + // convert the ssh authorized_keys to be added in base64 + sshAuthorizedKeysBase64 := base64.StdEncoding.EncodeToString([]byte(GenerateSSHAuthorizedKeys(d.DriverSSHPublicKey, d.ExternalSSHPublicKeys))) + + // submit deployment operation to kadeploy + op, err := d.G5kAPI.SubmitDeployment(api.DeploymentOperation{ + Nodes: []string{node}, + Environment: api.DeploymentOperationEnvironment{ + Kind: "database", + Name: d.G5kImage, + }, + CustomOperations: map[string]map[string]map[string][]api.DeploymentOperationCustomOperation{ + "BroadcastEnvKascade": { + "manage_user_post_install": { + "post-ops": { + api.DeploymentOperationCustomOperation{ + Name: "docker_machine_driver_ssh_root_pub_keys", + Action: "exec", + Command: fmt.Sprint(`printf ` + sshAuthorizedKeysBase64 + ` |base64 -d >> $KADEPLOY_ENV_EXTRACTION_DIR/root/.ssh/authorized_keys`), + }, + }, + }, + }, + }, }) + if err != nil { - d.handleDeploymentError() return fmt.Errorf("Error when submitting new deployment: %s", err.Error()) } + log.Infof("Deployment operation for '%s' node have been submitted successfully (workflow id: '%s')", node, op.WID) + // waiting deployment to finish (REQUIRED or you will interfere with kadeploy) - if err = d.waitUntilDeploymentIsFinished(deploymentID); err != nil { - d.handleDeploymentError() + if err = d.waitUntilWorkflowIsDone("deployment", op.WID, node); err != nil { return fmt.Errorf("Error when waiting for deployment to finish: %s", err.Error()) } From c8345f5cbf9004e48106905a04bfd9acd92d79a8 Mon Sep 17 00:00:00 2001 From: Guillaume Fieni Date: Thu, 30 May 2019 15:55:43 +0200 Subject: [PATCH 10/18] driver: The GetIP function automatically set the node IPAddress if needed --- driver/driver.go | 17 +++++++++++++++-- 1 file changed, 15 insertions(+), 2 deletions(-) diff --git a/driver/driver.go b/driver/driver.go index 8da5250..0062cfa 100644 --- a/driver/driver.go +++ b/driver/driver.go @@ -201,9 +201,22 @@ func (d *Driver) 
SetConfigFromFlags(opts drivers.DriverOptions) error { return nil } -// GetIP returns the ip +// GetIP returns an IP or hostname that this host is available at func (d *Driver) GetIP() (string, error) { - return d.BaseDriver.GetIP() + if d.IPAddress == "" { + job, err := d.G5kAPI.GetJob(d.G5kJobID) + if err != nil { + return "", err + } + + if len(job.Nodes) == 0 { + return "", fmt.Errorf("Failed to resolve IP address: The node have not been allocated") + } + + d.IPAddress = job.Nodes[0] + } + + return d.IPAddress, nil } // GetMachineName returns the machine name From 7242525bfa4ca225080e4a7b8b04cc9cfd6026df Mon Sep 17 00:00:00 2001 From: Guillaume Fieni Date: Thu, 30 May 2019 16:07:21 +0200 Subject: [PATCH 11/18] driver: The GetURL function will return an error when called on a stopped machine --- driver/driver.go | 15 +++++++++++---- 1 file changed, 11 insertions(+), 4 deletions(-) diff --git a/driver/driver.go b/driver/driver.go index 0062cfa..a7f0ff0 100644 --- a/driver/driver.go +++ b/driver/driver.go @@ -244,16 +244,23 @@ func (d *Driver) GetSSHUsername() string { return d.BaseDriver.GetSSHUsername() } -// GetURL returns the URL of the docker daemon +// GetURL returns a Docker compatible host URL for connecting to this host func (d *Driver) GetURL() (string, error) { - // get IP address + if err := drivers.MustBeRunning(d); err != nil { + return "", err + } + ip, err := d.GetIP() if err != nil { return "", err } - // format URL 'tcp://host:2376' - return fmt.Sprintf("tcp://%s", net.JoinHostPort(ip, "2376")), nil + u := url.URL{ + Scheme: "tcp", + Host: net.JoinHostPort(ip, "2376"), + } + + return u.String(), nil } // GetState returns the state of the node From f26c0e7a4d2151895a81ba0ce99f355e9c9989d0 Mon Sep 17 00:00:00 2001 From: Guillaume Fieni Date: Fri, 31 May 2019 12:14:39 +0200 Subject: [PATCH 12/18] driver: Rework GetState function to return the real state of the machine --- driver/driver.go | 34 ++++++++++++++++++++++++++-------- 1 file changed, 26 
insertions(+), 8 deletions(-) diff --git a/driver/driver.go b/driver/driver.go index a7f0ff0..fc2ba61 100644 --- a/driver/driver.go +++ b/driver/driver.go @@ -263,30 +263,48 @@ func (d *Driver) GetURL() (string, error) { return u.String(), nil } -// GetState returns the state of the node +// GetState returns the state that the host is in (running, stopped, etc) func (d *Driver) GetState() (state.State, error) { - // get job state from API - status, err := d.G5kAPI.GetJobState(d.G5kJobID) + job, err := d.G5kAPI.GetJob(d.G5kJobID) if err != nil { - return state.Error, err + return state.None, err } - switch status { + // filter job status where the node is not available + switch job.State { case "waiting": return state.Starting, nil case "launching": return state.Starting, nil - case "running": - return state.Running, nil case "hold": return state.Stopped, nil case "error": return state.Error, nil case "terminated": return state.Stopped, nil + case "running": + // noop, needs further checks default: - return state.None, nil + return state.None, fmt.Errorf("The job (id: %v) is in an unexpected state: %s", job.UID, job.State) } + + // Try to connect to the site frontend ssh server before continuing. + // This prevent to wrongly report the machine as Stopped when the user is disconnected from the VPN. 
+ if err := d.checkVpnConfiguration(); err != nil { + return state.None, err + } + + ip, err := d.GetIP() + if err != nil { + return state.None, err + } + + // Try to connect to the node ssh server + if err := CheckSSHConnection(ip); err != nil { + return state.Stopped, nil + } + + return state.Running, nil } // PreCreateCheck check parameters and submit the job to Grid5000 From 7ca9dc111f7dbfdf26e265cf2aa9258c0f8540e3 Mon Sep 17 00:00:00 2001 From: Guillaume Fieni Date: Fri, 31 May 2019 13:04:19 +0200 Subject: [PATCH 13/18] driver: Let the machine be created even if the job is not started to allow user to kill a job in any state --- driver/driver.go | 21 +++++---------------- 1 file changed, 5 insertions(+), 16 deletions(-) diff --git a/driver/driver.go b/driver/driver.go index fc2ba61..05b6e57 100644 --- a/driver/driver.go +++ b/driver/driver.go @@ -309,16 +309,13 @@ func (d *Driver) GetState() (state.State, error) { // PreCreateCheck check parameters and submit the job to Grid5000 func (d *Driver) PreCreateCheck() error { - // prepare the driver store dir if err := d.prepareDriverStoreDirectory(); err != nil { return err } - // check VPN connection if enabled - if !d.G5kSkipVpnChecks { - if err := d.checkVpnConnection(); err != nil { - return err - } + // check if the user is connected to the Grid'5000 VPN and its configuration is valid + if err := d.checkVpnConfiguration(); err != nil { + return err } // create API client @@ -355,24 +352,16 @@ func (d *Driver) PreCreateCheck() error { } } - // wait for job to be in 'running' state - if err := d.waitUntilJobIsReady(); err != nil { - return err - } - return nil } // Create wait for the job to be running, deploy the OS image and copy the ssh keys func (d *Driver) Create() error { - // get node hostname from API - job, err := d.G5kAPI.GetJob(d.G5kJobID) - if err != nil { + // wait for job to be in 'running' state + if err := d.waitUntilJobIsReady(); err != nil { return err } - d.BaseDriver.IPAddress = job.Nodes[0] 
- // deploy OS image to the node if err := d.deployImageToNode(); err != nil { return err } From f651cd8e99d72366db47ad7c03c28426d550e226 Mon Sep 17 00:00:00 2001 From: Guillaume Fieni Date: Fri, 31 May 2019 13:47:02 +0200 Subject: [PATCH 14/18] driver: Add support for start/stop/restart/kill machine operations --- driver/driver.go | 29 +++++++++++++++-------------- 1 file changed, 15 insertions(+), 14 deletions(-) diff --git a/driver/driver.go b/driver/driver.go index 05b6e57..2326b4b 100644 --- a/driver/driver.go +++ b/driver/driver.go @@ -3,6 +3,7 @@ package driver import ( "fmt" "net" + "net/url" "github.com/docker/machine/libmachine/mcnutils" @@ -224,22 +225,22 @@ func (d *Driver) GetMachineName() string { return d.BaseDriver.GetMachineName() } -// GetSSHHostname returns the machine hostname +// GetSSHHostname returns hostname for use with ssh func (d *Driver) GetSSHHostname() (string, error) { return d.GetIP() } -// GetSSHKeyPath returns the ssh private key path +// GetSSHKeyPath returns key path for use with ssh func (d *Driver) GetSSHKeyPath() string { return d.BaseDriver.GetSSHKeyPath() } -// GetSSHPort returns the ssh port +// GetSSHPort returns port for use with ssh func (d *Driver) GetSSHPort() (int, error) { return d.BaseDriver.GetSSHPort() } -// GetSSHUsername returns the ssh user name +// GetSSHUsername returns username for use with ssh func (d *Driver) GetSSHUsername() string { return d.BaseDriver.GetSSHUsername() } @@ -381,29 +382,29 @@ func (d *Driver) Create() error { func (d *Driver) Remove() error { // keep the resource allocated if the user asked for it if !d.G5kKeepAllocatedResourceAtDeletion { - log.Infof("Killing job... (id: '%d')", d.G5kJobID) - d.G5kAPI.KillJob(d.G5kJobID) + log.Infof("Deallocating resource... 
(Job ID: '%d')", d.G5kJobID) + return d.G5kAPI.KillJob(d.G5kJobID) } return nil } -// Kill don't do anything +// Kill perform a hard power-off on the node func (d *Driver) Kill() error { - return fmt.Errorf("The 'kill' operation is not supported on Grid'5000") + return d.changeNodePowerStatus("off", "hard") } -// Start don't do anything +// Start perform a soft power-on on the node func (d *Driver) Start() error { - return fmt.Errorf("The 'start' operation is not supported on Grid'5000") + return d.changeNodePowerStatus("on", "soft") } -// Stop don't do anything +// Stop perform a soft power-off on the node func (d *Driver) Stop() error { - return fmt.Errorf("The 'stop' operation is not supported on Grid'5000") + return d.changeNodePowerStatus("off", "soft") } -// Restart don't do anything +// Restart perform a soft reboot on the node func (d *Driver) Restart() error { - return fmt.Errorf("The 'restart' operation is not supported on Grid'5000") + return d.rebootNode("soft") } From d2cb0302c455f434c9fcd4f479d000ba6679e5be Mon Sep 17 00:00:00 2001 From: Guillaume Fieni Date: Mon, 3 Jun 2019 13:36:40 +0200 Subject: [PATCH 15/18] driver: Remove cli flag to skip VPN configuration checks --- driver/driver.go | 13 ------------- 1 file changed, 13 deletions(-) diff --git a/driver/driver.go b/driver/driver.go index 2326b4b..0792508 100644 --- a/driver/driver.go +++ b/driver/driver.go @@ -31,7 +31,6 @@ type Driver struct { G5kWalltime string G5kImage string G5kResourceProperties string - G5kSkipVpnChecks bool G5kReuseRefEnvironment bool G5kJobQueue string G5kJobStartTime string @@ -99,12 +98,6 @@ func (d *Driver) GetCreateFlags() []mcnflag.Flag { Usage: "Resource selection with OAR properties (SQL format)", }, - mcnflag.BoolFlag{ - EnvVar: "G5K_SKIP_VPN_CHECKS", - Name: "g5k-skip-vpn-checks", - Usage: "Skip the VPN client connection and DNS configuration checks (for specific use case only, you should not enable this flag in normal use)", - }, - mcnflag.BoolFlag{ EnvVar: 
"G5K_REUSE_REF_ENVIRONMENT", Name: "g5k-reuse-ref-environment", @@ -153,7 +146,6 @@ func (d *Driver) SetConfigFromFlags(opts drivers.DriverOptions) error { d.G5kWalltime = opts.String("g5k-walltime") d.G5kImage = opts.String("g5k-image") d.G5kResourceProperties = opts.String("g5k-resource-properties") - d.G5kSkipVpnChecks = opts.Bool("g5k-skip-vpn-checks") d.G5kReuseRefEnvironment = opts.Bool("g5k-reuse-ref-environment") d.G5kJobQueue = opts.String("g5k-job-queue") d.G5kJobStartTime = opts.String("g5k-make-resource-reservation") @@ -189,11 +181,6 @@ func (d *Driver) SetConfigFromFlags(opts drivers.DriverOptions) error { return fmt.Errorf("Reusing the Grid'5000 reference environment on a resource reservation is not supported") } - // warn if user disable VPN check - if d.G5kSkipVpnChecks { - log.Warn("VPN client connection and DNS configuration checks are disabled") - } - // we cannot use the besteffort queue with docker-machine if d.G5kJobQueue == "besteffort" { return fmt.Errorf("The besteffort queue is not supported") From 774017fecda5bd1518821900a7a723f62b67dadf Mon Sep 17 00:00:00 2001 From: Guillaume Fieni Date: Tue, 4 Jun 2019 09:45:50 +0200 Subject: [PATCH 16/18] driver: Add cli flag to select the node to use in a job reservation --- driver/driver.go | 64 +++++++++++++++++++++++++++++------------------- driver/g5k.go | 5 ++++ 2 files changed, 44 insertions(+), 25 deletions(-) diff --git a/driver/driver.go b/driver/driver.go index 0792508..5420182 100644 --- a/driver/driver.go +++ b/driver/driver.go @@ -37,6 +37,7 @@ type Driver struct { DriverSSHPublicKey string ExternalSSHPublicKeys []string G5kKeepAllocatedResourceAtDeletion bool + G5kNodeHostname string } // NewDriver creates and returns a new instance of the driver @@ -123,6 +124,12 @@ func (d *Driver) GetCreateFlags() []mcnflag.Flag { Usage: "Use a resource reservation (need to be a job of 'deploy' type and in the 'running' state)", }, + mcnflag.StringFlag{ + EnvVar: "G5K_SELECT_NODE_FROM_RESERVATION", 
+ Name: "g5k-select-node-from-reservation", + Usage: "Hostname of the node to use from the reservation. (SHOULD be in the allocated node(s) of the resource reservation)", + }, + mcnflag.StringSliceFlag{ EnvVar: "G5K_EXTERNAL_SSH_PUBLIC_KEYS", Name: "g5k-external-ssh-public-keys", @@ -140,6 +147,7 @@ func (d *Driver) GetCreateFlags() []mcnflag.Flag { // SetConfigFromFlags configure the driver from the command line arguments func (d *Driver) SetConfigFromFlags(opts drivers.DriverOptions) error { + d.BaseDriver.SetSwarmConfigFromFlags(opts) d.G5kUsername = opts.String("g5k-username") d.G5kPassword = opts.String("g5k-password") d.G5kSite = opts.String("g5k-site") @@ -152,38 +160,40 @@ func (d *Driver) SetConfigFromFlags(opts drivers.DriverOptions) error { d.G5kJobID = opts.Int("g5k-use-resource-reservation") d.ExternalSSHPublicKeys = opts.StringSlice("g5k-external-ssh-public-keys") d.G5kKeepAllocatedResourceAtDeletion = opts.Bool("g5k-keep-resource-at-deletion") + d.G5kNodeHostname = opts.String("g5k-select-node-from-reservation") - // Docker Swarm - d.BaseDriver.SetSwarmConfigFromFlags(opts) - - // username is required if d.G5kUsername == "" { return fmt.Errorf("You must give your Grid5000 account username") } - - // password is required if d.G5kPassword == "" { return fmt.Errorf("You must give your Grid5000 account password") } - - // site is required if d.G5kSite == "" { return fmt.Errorf("You must give the site you want to reserve the resources on") } - // contradictory use of parameters: providing an image to deploy while trying to reuse the reference environment - if d.G5kReuseRefEnvironment && d.G5kImage != g5kReferenceEnvironmentName { - return fmt.Errorf("You have to choose between reusing the reference environment or redeploying the node with another image") + // The besteffort queue is only for interruptible jobs and cannot be used in the case of Docker machine + if d.G5kJobQueue == "besteffort" { + return fmt.Errorf("The besteffort queue is not supported") 
} - // we cannot reuse the reference environment when the job is of type 'deploy' - if d.G5kReuseRefEnvironment && (d.G5kJobStartTime != "" || d.G5kJobID != 0) { - return fmt.Errorf("Reusing the Grid'5000 reference environment on a resource reservation is not supported") + if d.G5kReuseRefEnvironment { + // Contradictory use of parameters: providing an image to deploy while trying to reuse the reference environment + if d.G5kImage != g5kReferenceEnvironmentName { + return fmt.Errorf("You have to choose between reusing the reference environment or redeploying the node with another image") + } + + // Reusing the reference environment is only possible when the job is NOT of type 'deploy' + if d.G5kJobStartTime != "" || d.G5kJobID != 0 { + return fmt.Errorf("Reusing the Grid'5000 reference environment on a resource reservation is not supported") + } } - // we cannot use the besteffort queue with docker-machine - if d.G5kJobQueue == "besteffort" { - return fmt.Errorf("The besteffort queue is not supported") + if d.G5kNodeHostname != "" { + // Node selection flag can only be used on a resource reservation because there will be only one node in a submission. 
+ if d.G5kJobID == 0 { + return fmt.Errorf("You cannot select a node when doing a job submission") + } } return nil @@ -192,16 +202,20 @@ func (d *Driver) SetConfigFromFlags(opts drivers.DriverOptions) error { // GetIP returns an IP or hostname that this host is available at func (d *Driver) GetIP() (string, error) { if d.IPAddress == "" { - job, err := d.G5kAPI.GetJob(d.G5kJobID) - if err != nil { - return "", err - } + if d.G5kNodeHostname == "" { + job, err := d.G5kAPI.GetJob(d.G5kJobID) + if err != nil { + return "", err + } + + if len(job.Nodes) == 0 { + return "", fmt.Errorf("Failed to resolve IP address: The node have not been allocated") + } - if len(job.Nodes) == 0 { - return "", fmt.Errorf("Failed to resolve IP address: The node have not been allocated") + d.G5kNodeHostname = job.Nodes[0] } - d.IPAddress = job.Nodes[0] + d.IPAddress = d.G5kNodeHostname } return d.IPAddress, nil @@ -273,7 +287,7 @@ func (d *Driver) GetState() (state.State, error) { case "running": // noop, needs further checks default: - return state.None, fmt.Errorf("The job (id: %v) is in an unexpected state: %s", job.UID, job.State) + return state.None, fmt.Errorf("The job is in an unexpected state: %s", job.State) } // Try to connect to the site frontend ssh server before continuing. diff --git a/driver/g5k.go b/driver/g5k.go index 330811c..d383e03 100644 --- a/driver/g5k.go +++ b/driver/g5k.go @@ -164,6 +164,11 @@ func (d *Driver) deployImageToNode() error { return fmt.Errorf("Failed to get the node hostname: %s", err.Error()) } + // check if the node is allocated to the job + if !ArrayContainsString(job.Nodes, node) { + return fmt.Errorf("The node '%s' is not allocated to the job (id: %d)", node, d.G5kJobID) + } + log.Infof("Submitting a new deployment for node '%s'... 
(image: '%s')", node, d.G5kImage) // convert the ssh authorized_keys to be added in base64 From fceeeb9513517dc602f9e99b30a523e3acf8db90 Mon Sep 17 00:00:00 2001 From: Guillaume Fieni Date: Tue, 4 Jun 2019 11:53:08 +0200 Subject: [PATCH 17/18] driver: Prevent sending power/reboot operations to the node when the user reuse the g5k environment --- driver/g5k.go | 8 ++++++++ 1 file changed, 8 insertions(+) diff --git a/driver/g5k.go b/driver/g5k.go index d383e03..31efa47 100644 --- a/driver/g5k.go +++ b/driver/g5k.go @@ -250,6 +250,10 @@ func (d *Driver) getNodePowerState() (string, error) { // changeNodePowerStatus change the power status (on/off) of the node with the given level (soft/hard) func (d *Driver) changeNodePowerStatus(status string, level string) error { + if d.G5kReuseRefEnvironment { + return fmt.Errorf("You can't power-%s (%s) the node when reusing the Grid'5000 environment", status, level) + } + node, err := d.GetIP() if err != nil { return fmt.Errorf("Failed to get the node hostname: %s", err.Error()) @@ -271,6 +275,10 @@ func (d *Driver) changeNodePowerStatus(status string, level string) error { // rebootNode reboot the node with the given level (soft/hard) func (d *Driver) rebootNode(level string) error { + if d.G5kReuseRefEnvironment { + return fmt.Errorf("You can't reboot (%s) the node when reusing the Grid'5000 environment", level) + } + node, err := d.GetIP() if err != nil { return fmt.Errorf("Failed to get the node hostname: %s", err.Error()) From 005203a7122f599cdb18378ed141080bbaf73823 Mon Sep 17 00:00:00 2001 From: Guillaume Fieni Date: Wed, 5 Jun 2019 08:51:02 +0200 Subject: [PATCH 18/18] docs: Update README - Change CircleCI badge - Add GH release badge - Add information about resource reservation flags - Fix typos in examples - Update project description - Remove reference to CPU architectures for binaries --- README.md | 98 +++++++++++++++++++++++++++++++------------------------ 1 file changed, 55 insertions(+), 43 deletions(-) diff 
--git a/README.md b/README.md index d58f685..9a45015 100644 --- a/README.md +++ b/README.md @@ -1,29 +1,30 @@ -[![CircleCI](https://circleci.com/gh/Spirals-Team/docker-machine-driver-g5k.svg?style=svg)](https://circleci.com/gh/Spirals-Team/docker-machine-driver-g5k) +[![CircleCI](https://circleci.com/gh/Spirals-Team/docker-machine-driver-g5k.svg?style=shield)](https://circleci.com/gh/Spirals-Team/docker-machine-driver-g5k) +[![GitHub release](https://img.shields.io/github/release/Spirals-Team/docker-machine-driver-g5k.svg)](https://github.com/Spirals-Team/docker-machine-driver-g5k/releases) [![License](https://img.shields.io/github/license/Spirals-Team/docker-machine-driver-g5k.svg)](https://opensource.org/licenses/Apache-2.0) # docker-machine-driver-g5k -A Docker Machine driver for the Grid5000 testbed infrastructure. It can be used to provision a Docker machine on a node of the Grid5000 infrastructure. +[Docker Machine](https://docs.docker.com/machine/install-machine) driver for the [Grid'5000 testbed infrastructure](https://www.grid5000.fr/). ## Requirements * [Docker](https://www.docker.com/products/overview#/install_the_platform) * [Docker Machine](https://docs.docker.com/machine/install-machine) -* [Go tools (Only for installation from sources)](https://golang.org/doc/install) +* [Go tools (only for installation from sources)](https://golang.org/doc/install) -You need a Grid5000 account to use this driver. See [this page](https://www.grid5000.fr/mediawiki/index.php/Grid5000:Get_an_account) to create an account. +You need a Grid'5000 account to use this driver. See [this page](https://www.grid5000.fr/mediawiki/index.php/Grid5000:Get_an_account) to create an account. 
## VPN -**You need to be connected to the Grid5000 VPN to create and access your Docker node.** +**You need to be connected to the Grid'5000 VPN to create and access your Docker node.** **Do not forget to configure your DNS or use OpenVPN DNS auto-configuration.** -**Please follow the instructions from the [Grid5000 Wiki](https://www.grid5000.fr/mediawiki/index.php/VPN).** +**Please follow the instructions from the [Grid'5000 Wiki](https://www.grid5000.fr/mediawiki/index.php/VPN).** ## Installation from GitHub releases -Binary releases for Linux, MacOS and Windows using x86/x86_64 CPU architectures are available in the [releases page](https://github.com/Spirals-Team/docker-machine-driver-g5k/releases). +Binary releases for Linux, MacOS and Windows are available in the [releases page](https://github.com/Spirals-Team/docker-machine-driver-g5k/releases). On Linux and MacOS, you can use the following commands to install or upgrade the driver: ```bash -# download the binary for your OS and CPU architecture : +# download the binary for your OS and CPU architecture: sudo curl -L -o /usr/local/bin/docker-machine-driver-g5k "" -# grant execution rigths to the driver for everyone : +# grant execution rights to the driver for everyone: sudo chmod +x /usr/local/bin/docker-machine-driver-g5k ``` @@ -45,36 +46,36 @@ export PATH=$PATH:$GOPATH/bin ### Driver-specific command line flags #### Flags description -* **`--g5k-username` : Your Grid5000 account username (required)** -* **`--g5k-password` : Your Grid5000 account password (required)** +* **`--g5k-username` : Your Grid'5000 account username (required)** +* **`--g5k-password` : Your Grid'5000 account password (required)** * **`--g5k-site` : Site where the reservation of the node will be made (required)** -* `--g5k-walltime` : Duration of the resource reservation (in "HH:MM:SS" format) +* `--g5k-walltime` : Duration of the resource reservation (in `HH:MM:SS` format) * `--g5k-image` : Name of the system image to deploy on the
node * `--g5k-resource-properties` : [Resource selection with OAR properties](#resource-properties) * `--g5k-make-resource-reservation` : [Make a resource reservation for the given start date](#resource-reservation) * `--g5k-use-resource-reservation` : [Use a resource reservation (need to be an existing job ID)](#resource-reservation) -* `--g5k-skip-vpn-checks` : Skip the VPN client connection and DNS configuration checks +* `--g5k-select-node-from-reservation` : [Hostname of the node to use from the reservation](#resource-reservation) * `--g5k-reuse-ref-environment` : [Reuse the Grid'5000 reference environment instead of re-deploying the node](#grid5000-reference-environment-reuse) -* `--g5k-job-queue` : [Specify the job queue (besteffort queue is NOT supported)](#job-queues) +* `--g5k-job-queue` : [Specify the job queue (the `besteffort` queue is NOT supported)](#job-queues) * `--g5k-external-ssh-public-keys` : SSH public key(s) allowed to connect to the node (in authorized_keys format) * `--g5k-keep-resource-at-deletion` : [Keep the allocated resource when removing the machine](#resource-reservation) #### Flags usage -| Option | Environment | Default value | -|-----------------------------------|---------------------------------|-----------------------| -| `--g5k-username` | `G5K_USERNAME` | | -| `--g5k-password` | `G5K_PASSWORD` | | -| `--g5k-site` | `G5K_SITE` | | -| `--g5k-walltime` | `G5K_WALLTIME` | "1:00:00" | -| `--g5k-image` | `G5K_IMAGE` | "debian9-x64-std" | -| `--g5k-resource-properties` | `G5K_RESOURCE_PROPERTIES` | | -| `--g5k-make-resource-reservation` | `G5K_MAKE_RESOURCE_RESERVATION` | | -| `--g5k-use-resource-reservation` | `G5K_USE_RESOURCE_RESERVATION` | | -| `--g5k-skip-vpn-checks` | `G5K_SKIP_VPN_CHECKS` | False | -| `--g5k-reuse-ref-environment` | `G5K_REUSE_REF_ENVIRONMENT` | False | -| `--g5k-job-queue` | `G5K_JOB_QUEUE` | "default" | -| `--g5k-external-ssh-public-keys` | `G5K_EXTERNAL_SSH_PUBLIC_KEYS` | | -| 
`--g5k-keep-resource-at-deletion` | `G5K_KEEP_RESOURCE_AT_DELETION` | False | +| Flag name | Environment variable | Default value | +|--------------------------------------|------------------------------------|-----------------------| +| `--g5k-username` | `G5K_USERNAME` | | +| `--g5k-password` | `G5K_PASSWORD` | | +| `--g5k-site` | `G5K_SITE` | | +| `--g5k-walltime` | `G5K_WALLTIME` | "1:00:00" | +| `--g5k-image` | `G5K_IMAGE` | "debian9-x64-std" | +| `--g5k-resource-properties` | `G5K_RESOURCE_PROPERTIES` | | +| `--g5k-make-resource-reservation` | `G5K_MAKE_RESOURCE_RESERVATION` | | +| `--g5k-use-resource-reservation` | `G5K_USE_RESOURCE_RESERVATION` | | +| `--g5k-select-node-from-reservation` | `G5K_SELECT_NODE_FROM_RESERVATION` | | +| `--g5k-reuse-ref-environment` | `G5K_REUSE_REF_ENVIRONMENT` | False | +| `--g5k-job-queue` | `G5K_JOB_QUEUE` | "default" | +| `--g5k-external-ssh-public-keys` | `G5K_EXTERNAL_SSH_PUBLIC_KEYS` | | +| `--g5k-keep-resource-at-deletion` | `G5K_KEEP_RESOURCE_AT_DELETION` | False | #### Resource properties You can use [OAR properties](http://oar.imag.fr/docs/2.5/user/usecases.html#using-properties) to only select a node that matches your hardware requirements. @@ -89,12 +90,12 @@ More information about usage of OAR properties are available on the [Grid'5000 W #### Resource reservation You can either do a job submission to reserve resources as soon as possible (this is the default mode) or do an advance reservation for a specific date/time. -To do a resource reservation, you need to use the `--g5k-make-resource-reservation` flag and provide a starting date/time in either the 'YYYY-MM-DD HH:MM:SS' date format or an UNIX timestamp. +To do a resource reservation, you need to use the `--g5k-make-resource-reservation` flag and provide a starting date/time in either the `YYYY-MM-DD HH:MM:SS` date format or an UNIX timestamp. 
Don't forget to save the job ID of your reservation in order to be able to create a machine when the resources will be available. To use a resource reservation, set the `--g5k-use-resource-reservation` flag with the job ID of an existing reservation. -This will create a machine, deploy an OS image and provision Docker on the node. -Please note that the job must be in `running` state in order for the machine to be created, otherwise the driver will wait until the job start. +In case the reservation has multiple nodes, you need to select one using the `--g5k-select-node-from-reservation` flag, otherwise the first node will be taken. +This will create a machine, deploy an OS image and provision Docker on the node. Please note that the job must be in `running` state in order for the machine to be created, otherwise the driver will wait until the job starts. By default the resource is automatically deallocated when you remove a machine using the `rm` command. However, you can use the `g5k-keep-resource-at-deletion` flag when creating the machine to keep the resource allocated even when the machine is removed. @@ -120,18 +121,17 @@ Error with pre-create check: "Error when submitting new job: The server returned See [this page](https://www.grid5000.fr/mediawiki/index.php/Grid5000:UsagePolicy#Rules_for_the_production_queue) for more information about the production queue.
### Usage examples -An example of node provisioning reusing the Grid'5000 standard environment: +An example reusing the Grid'5000 standard environment: ```bash docker-machine create -d g5k \ --g5k-username "user" \ --g5k-password "********" \ --g5k-site "lille" \ ---engine-storage-driver "overlay2" \ --g5k-reuse-ref-environment \ test-node ``` -An example of node provisioning deploying the `debian9-x64-min` environment on the node: +An example deploying the `debian9-x64-min` environment image on the node: ```bash docker-machine create -d g5k \ --g5k-username "user" \ @@ -141,7 +141,7 @@ docker-machine create -d g5k \ test-node ``` -An example of node provisioning using environment variables: +An example using environment variables to configure the driver: ```bash export G5K_USERNAME="user" export G5K_PASSWORD="********" @@ -149,7 +149,7 @@ export G5K_SITE="lille" docker-machine create -d g5k test-node ``` -An example with resource properties (node in cluster `chimint` with more thant 8GB of RAM and at least 4 CPU cores): +An example using resource properties (node in `chimint` cluster having more than 8GB of RAM and at least 4 CPU cores): ```bash docker-machine create -d g5k \ --g5k-username "user" \ @@ -159,7 +159,7 @@ docker-machine create -d g5k \ test-node ``` -An example of doing a resource reservation of 1 node for `8 hours` starting the `2019-01-01` at `20:00:00`: +An example doing a resource reservation of 1 node for `8 hours` starting the `2019-01-01` at `20:00:00`: ```bash docker-machine create -d g5k \ --g5k-username "user" \ @@ -170,22 +170,34 @@ docker-machine create -d g5k \ test-node ``` -An example of using a resource reservation: +An example using the first node of a resource reservation having the `1234567` job ID: ```bash docker-machine create -d g5k \ --g5k-username "user" \ --g5k-password "********" \ --g5k-site "lille" \ ---g5k-use-resource-reservation 1234567 \ +--g5k-use-resource-reservation "1234567" \ test-node -``` +``` -An example adding 
multiple external SSH keys (your keys can be of any supported format, and may be longer than the following example): +An example using the `chifflet-2.lille.grid5000.fr` node of a resource reservation having the `1234567` job ID: +```bash +docker-machine create -d g5k \ +--g5k-username "user" \ +--g5k-password "********" \ +--g5k-site "lille" \ +--g5k-use-resource-reservation "1234567" \ +--g5k-select-node-from-reservation "chifflet-2.lille.grid5000.fr" \ +test-node +``` + +An example adding two external SSH keys (the keys can be of any supported type and size): ```bash docker-machine create -d g5k \ --g5k-username "user" \ --g5k-password "********" \ --g5k-site "lille" \ --g5k-external-ssh-public-keys "ssh-ed25519 AAAAC3NzaC1lZDI1NTE5AAAAIFLs3JzUYn7LbHE+SzJNoMvYbasnhjlen0k6dFs801DT test-ed25519" \ ---g5k-external-ssh-public-keys "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAAAgQC5qQt/nzGW19uCb9CDVEvP93LZ2mu3rd7drPP1nLf1pzLwlL2U2ksfwDCjMWU0P7KA6tB4scI+4dhxj07t0Z8g4TsMGYhbG0kjf7tWN73DombB4h/zobo2GvVoMg0NBLTP4peXLYAEofTYc0g7OWtJicAzLwcMzHsitDjjBwCKHQ== test-rsa" +--g5k-external-ssh-public-keys "ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAAAgQC5qQt/nzGW19uCb9CDVEvP93LZ2mu3rd7drPP1nLf1pzLwlL2U2ksfwDCjMWU0P7KA6tB4scI+4dhxj07t0Z8g4TsMGYhbG0kjf7tWN73DombB4h/zobo2GvVoMg0NBLTP4peXLYAEofTYc0g7OWtJicAzLwcMzHsitDjjBwCKHQ== test-rsa" \ +test-node ```