diff --git a/.github/CODEOWNERS b/.github/CODEOWNERS index 7cd3d1dab2..21611cd84d 100644 --- a/.github/CODEOWNERS +++ b/.github/CODEOWNERS @@ -2,137 +2,71 @@ # the repo. Unless a later match takes precedence, these will # be requested for review when someone opens a pull request. -* @eriknordmark - -/pkg/edgeview/ @naiming-zededa -/pkg/newlog/ @naiming-zededa - - -/api/ @deitch -/build-tools/ @deitch -/libs/zedUpload/ @deitch -/pkg/newlog/ @deitch -/pkg/pillar/cas/ @deitch -/pkg/pillar/cmd/baseosmgr/ @deitch -/pkg/pillar/cmd/downloader/ @deitch -/pkg/pillar/cmd/volumemgr/ @deitch -/pkg/pillar/containerd/ @deitch -/pkg/rngd/ @deitch -/tools/ @deitch - - -/libs/zedUpload/ @christoph-zededa -/pkg/alpine/ @christoph-zededa -/pkg/dom0-ztools/ @christoph-zededa -/pkg/edgeview/ @christoph-zededa -/pkg/pillar/cmd/zedrouter/ @christoph-zededa -/pkg/pillar/zedcloud/ @christoph-zededa - - -/pkg/mkimage-raw-efi/ @jsfakian -/pkg/mkverification-raw-efi/ @jsfakian -/pkg/pillar/cmd/domainmgr/ @jsfakian -/pkg/verification/ @jsfakian -/tools/ @jsfakian - - -/pkg/grub/ @mikem-zed -/pkg/kernel/ @mikem-zed -/pkg/measure-config/ @mikem-zed -/pkg/new-kernel/ @mikem-zed -/pkg/pillar/cmd/ledmanager/ @mikem-zed -/pkg/pillar/cmd/tpmmgr/ @mikem-zed -/pkg/pillar/evetpm/ @mikem-zed -/pkg/pillar/hypervisor/ @mikem-zed -/pkg/uefi/ @mikem-zed -/pkg/xen-tools/ @mikem-zed -/pkg/xen/ @mikem-zed - - -/libs/depgraph/ @milan-zededa -/libs/nettrace/ @milan-zededa -/libs/reconciler/ @milan-zededa -/pkg/pillar/cmd/domainmgr/ @milan-zededa -/pkg/pillar/cmd/downloader/ @milan-zededa -/pkg/pillar/cmd/nim/ @milan-zededa -/pkg/pillar/cmd/zedagent/ @milan-zededa -/pkg/pillar/cmd/zedrouter/ @milan-zededa -/pkg/pillar/devicenetwork/ @milan-zededa -/pkg/pillar/dpcmanager/ @milan-zededa -/pkg/pillar/dpcreconciler/ @milan-zededa -/pkg/pillar/iptables/ @milan-zededa -/pkg/pillar/netdump/ @milan-zededa -/pkg/pillar/netmonitor/ @milan-zededa -/pkg/pillar/nireconciler/ @milan-zededa -/pkg/pillar/nistate/ @milan-zededa 
-/pkg/pillar/uplinkprober/ @milan-zededa -/pkg/pillar/utils/ @milan-zededa -/pkg/wwan/ @milan-zededa -/tests/eden/ @milan-zededa - - -/pkg/pillar/cmd/domainmgr/ @OhmSpectator -/pkg/pillar/cmd/volumemgr/ @OhmSpectator -/pkg/pillar/cmd/zedagent/ @OhmSpectator -/pkg/pillar/cmd/zedmanager/ @OhmSpectator -/pkg/pillar/cpuallocator/ @OhmSpectator -/pkg/pillar/hypervisor/ @OhmSpectator -/pkg/pillar/volumehandlers/ @OhmSpectator -/pkg/xen-tools/ @OhmSpectator -/pkg/xen/ @OhmSpectator - - -/pkg/pillar/cmd/domainmgr/ @uncleDecart -/pkg/pillar/cmd/zedagent/ @uncleDecart -/pkg/pillar/sriov/ @uncleDecart -/tests/eden/ @uncleDecart - - -/pkg/kube/ @zedi-pramodh -/pkg/mkimage-raw-efi/ @zedi-pramodh -/pkg/pillar/zfs/ @zedi-pramodh - - -/pkg/bsp-imx/ @rene -/pkg/cross-compilers/ @rene -/pkg/debug/lshw/ @rene -/pkg/fw/ @rene -/pkg/grub/ @rene -/pkg/kernel/ @rene -/pkg/new-kernel/ @rene -/pkg/optee-os/ @rene -/pkg/pillar/cmd/ledmanager/ @rene -/pkg/pillar/hypervisor/ @rene -/pkg/u-boot/ @rene -/pkg/xen-tools/ @rene -/pkg/xen/ @rene - - -/libs/zedUpload/ @rouming -/pkg/debug/ @rouming -/pkg/dom0-ztools/ @rouming -/pkg/kdump/ @rouming -/pkg/kernel/ @rouming -/pkg/kexec/ @rouming -/pkg/new-kernel/ @rouming -/pkg/pillar/cmd/downloader/ @rouming -/pkg/pillar/cmd/volumemgr/ @rouming -/pkg/pillar/cmd/zedagent/ @rouming -/pkg/pillar/cmd/zedmanager/ @rouming -/pkg/pillar/cmd/zedrouter/ @rouming -/pkg/pillar/containerd/ @rouming -/pkg/pillar/hypervisor/ @rouming -/pkg/pillar/zedcloud/ @rouming -/pkg/storage-init/ @rouming - - -/pkg/apparmor/ @shjala -/pkg/dom0-tools/ @shjala -/pkg/kernel/ @shjala -/pkg/new-kernel/ @shjala -/pkg/pillar/cmd/tpmmgr/ @shjala -/pkg/pillar/evetpm/ @shjala -/pkg/pillar/hypervisor/ @shjala -/pkg/vtpm/ @shjala -/pkg/xen-tools/ @shjala -/pkg/xen/ @shjala +* @eriknordmark +/api/ @deitch +/build-tools/ @deitch +/libs/depgraph/ @milan-zededa +/libs/nettrace/ @milan-zededa +/libs/reconciler/ @milan-zededa +/libs/zedUpload/ @christoph-zededa @deitch @rouming +/pkg/alpine/ 
@christoph-zededa +/pkg/apparmor/ @shjala +/pkg/bsp-imx/ @rene +/pkg/cross-compilers/ @rene +/pkg/debug/ @rouming +/pkg/debug/lshw/ @rene +/pkg/dom0-tools/ @shjala +/pkg/dom0-ztools/ @christoph-zededa @rouming +/pkg/edgeview/ @christoph-zededa @naiming-zededa +/pkg/fw/ @rene +/pkg/grub/ @mikem-zed @rene +/pkg/kdump/ @rouming +/pkg/kernel/ @mikem-zed @rene @rouming @shjala +/pkg/kexec/ @rouming +/pkg/kube/ @zedi-pramodh +/pkg/measure-config/ @mikem-zed +/pkg/mkimage-raw-efi/ @jsfakian @zedi-pramodh +/pkg/mkverification-raw-efi/ @jsfakian +/pkg/new-kernel/ @mikem-zed @rene @rouming @shjala +/pkg/newlog/ @deitch @naiming-zededa +/pkg/optee-os/ @rene +/pkg/pillar/cas/ @deitch +/pkg/pillar/cmd/baseosmgr/ @deitch +/pkg/pillar/cmd/domainmgr/ @OhmSpectator @jsfakian @milan-zededa @uncleDecart +/pkg/pillar/cmd/downloader/ @deitch @milan-zededa @rouming +/pkg/pillar/cmd/ledmanager/ @mikem-zed @rene +/pkg/pillar/cmd/nim/ @milan-zededa +/pkg/pillar/cmd/tpmmgr/ @mikem-zed @shjala +/pkg/pillar/cmd/volumemgr/ @OhmSpectator @deitch @rouming +/pkg/pillar/cmd/zedagent/ @OhmSpectator @milan-zededa @rouming @uncleDecart +/pkg/pillar/cmd/zedmanager/ @OhmSpectator @rouming +/pkg/pillar/cmd/zedrouter/ @christoph-zededa @milan-zededa @rouming +/pkg/pillar/containerd/ @deitch @rouming +/pkg/pillar/cpuallocator/ @OhmSpectator +/pkg/pillar/devicenetwork/ @milan-zededa +/pkg/pillar/dpcmanager/ @milan-zededa +/pkg/pillar/dpcreconciler/ @milan-zededa +/pkg/pillar/evetpm/ @mikem-zed @shjala +/pkg/pillar/hypervisor/ @OhmSpectator @mikem-zed @rene @rouming @shjala +/pkg/pillar/iptables/ @milan-zededa +/pkg/pillar/netdump/ @milan-zededa +/pkg/pillar/netmonitor/ @milan-zededa +/pkg/pillar/nireconciler/ @milan-zededa +/pkg/pillar/nistate/ @milan-zededa +/pkg/pillar/sriov/ @uncleDecart +/pkg/pillar/uplinkprober/ @milan-zededa +/pkg/pillar/utils/ @milan-zededa +/pkg/pillar/volumehandlers/ @OhmSpectator +/pkg/pillar/zedcloud/ @christoph-zededa @rouming +/pkg/pillar/zfs/ @zedi-pramodh +/pkg/rngd/ @deitch 
+/pkg/storage-init/ @rouming +/pkg/u-boot/ @rene +/pkg/uefi/ @mikem-zed +/pkg/verification/ @jsfakian +/pkg/vtpm/ @shjala +/pkg/wwan/ @milan-zededa +/pkg/xen-tools/ @OhmSpectator @mikem-zed @rene @shjala +/pkg/xen/ @OhmSpectator @mikem-zed @rene @shjala +/tests/eden/ @milan-zededa @uncleDecart +/tools/ @deitch @jsfakian diff --git a/Makefile b/Makefile index 98bef2c05b..b10fa0c9fb 100644 --- a/Makefile +++ b/Makefile @@ -731,12 +731,12 @@ publish_sources: $(COLLECTED_SOURCES) $(LIVE).raw: $(BOOT_PART) $(EFI_PART) $(ROOTFS_IMG) $(CONFIG_IMG) $(PERSIST_IMG) $(BSP_IMX_PART) | $(INSTALLER) ./tools/prepare-platform.sh "$(PLATFORM)" "$(BUILD_DIR)" "$(INSTALLER)" || : - ./tools/makeflash.sh -C 559 $| $@ $(PART_SPEC) + ./tools/makeflash.sh "mkimage-raw-efi" -C 559 $| $@ $(PART_SPEC) $(QUIET): $@: Succeeded $(INSTALLER).raw: $(BOOT_PART) $(EFI_PART) $(ROOTFS_IMG) $(INITRD_IMG) $(INSTALLER_IMG) $(CONFIG_IMG) $(PERSIST_IMG) $(BSP_IMX_PART) | $(INSTALLER) ./tools/prepare-platform.sh "$(PLATFORM)" "$(BUILD_DIR)" "$(INSTALLER)" || : - ./tools/makeflash.sh -C 592 $| $@ "conf_win installer inventory_win" + ./tools/makeflash.sh "mkimage-raw-efi" -C 592 $| $@ "conf_win installer inventory_win" $(QUIET): $@: Succeeded $(INSTALLER).iso: $(EFI_PART) $(ROOTFS_IMG) $(INITRD_IMG) $(INSTALLER_IMG) $(CONFIG_IMG) $(PERSIST_IMG) | $(INSTALLER) @@ -760,7 +760,7 @@ $(LIVE).parallels: $(LIVE).raw $(VERIFICATION).raw: $(BOOT_PART) $(EFI_PART) $(ROOTFS_IMG) $(INITRD_IMG) $(VERIFICATION_IMG) $(CONFIG_IMG) $(PERSIST_IMG) $(BSP_IMX_PART) | $(VERIFICATION) @cp -r $(INSTALLER)/* $(VERIFICATION) ./tools/prepare-platform.sh "$(PLATFORM)" "$(BUILD_DIR)" "$(VERIFICATION)" || : - ./tools/makeverification.sh -C 850 $| $@ "conf_win verification inventory_win" + ./tools/makeflash.sh "mkverification-raw-efi" -C 850 $| $@ "conf_win verification inventory_win" $(QUIET): $@: Succeeded $(VERIFICATION).net: $(EFI_PART) $(ROOTFS_IMG) $(INITRD_IMG) $(VERIFICATION_IMG) $(CONFIG_IMG) $(PERSIST_IMG) $(KERNEL_IMG) | 
$(VERIFICATION) @@ -806,6 +806,7 @@ verification: $(VERIFICATION) $(VERIFICATION_ARTIFACTS) current | $(BUILD_DIR) $(QUIET)if [ -n "$(EVE_REL)" ] && [ $(HV) = $(HV_DEFAULT) ]; then \ $(LINUXKIT) $(DASH_V) pkg $(LINUXKIT_PKG_TARGET) --platforms linux/$(ZARCH) --hash-path $(CURDIR) --hash $(EVE_REL)-$(HV) --docker --release $(EVE_REL) $(FORCE_BUILD) $| ;\ fi + cp -r $|/installer/* $|/verification $(QUIET): $@: Succeeded .PHONY: image-set outfile-set cache-export cache-export-docker-load cache-export-docker-load-all diff --git a/pkg/kernel/kernel-config/kernel_config-5.10.x-preempt-rt-x86_64.patch b/pkg/kernel/kernel-config/kernel_config-5.10.x-preempt-rt-x86_64.patch index d13d69610b..23a9020a7c 100644 --- a/pkg/kernel/kernel-config/kernel_config-5.10.x-preempt-rt-x86_64.patch +++ b/pkg/kernel/kernel-config/kernel_config-5.10.x-preempt-rt-x86_64.patch @@ -165,7 +165,7 @@ # end of Scheduler Debugging # CONFIG_DEBUG_TIMEKEEPING is not set -+CONFIG_DEBUG_PREEMPT=y ++# CONFIG_DEBUG_PREEMPT is not set # # Lock Debugging (spinlocks, mutexes, etc...) 
diff --git a/pkg/pillar/base/logobjecttypes.go b/pkg/pillar/base/logobjecttypes.go index 1559c59809..09ac874a8e 100644 --- a/pkg/pillar/base/logobjecttypes.go +++ b/pkg/pillar/base/logobjecttypes.go @@ -164,6 +164,8 @@ const ( EncryptedVaultKeyFromDeviceLogType LogObjectType = "encrypted_vault_key_from_device" // EncryptedVaultKeyFromControllerLogType: EncryptedVaultKeyFromControllerLogType LogObjectType = "encrypted_vault_key_from_controller" + // CachedResolvedIPsLogType: + CachedResolvedIPsLogType LogObjectType = "cached_resolved_ips" ) // RelationObjectType : diff --git a/pkg/pillar/cmd/client/client.go b/pkg/pillar/cmd/client/client.go index 4a117a9c67..bfff7c2f16 100644 --- a/pkg/pillar/cmd/client/client.go +++ b/pkg/pillar/cmd/client/client.go @@ -66,6 +66,7 @@ type clientContext struct { usableAddressCount int networkState types.DPCState subGlobalConfig pubsub.Subscription + subCachedResolvedIPs pubsub.Subscription globalConfig *types.ConfigItemValueMap zedcloudCtx *zedcloud.ZedCloudContext getCertsTimer *time.Timer @@ -97,6 +98,16 @@ func (ctxPtr *clientContext) ProcessAgentSpecificCLIFlags(flagSet *flag.FlagSet) } } +func (ctxPtr *clientContext) getCachedResolvedIPs(hostname string) []types.CachedIP { + if ctxPtr.subCachedResolvedIPs == nil { + return nil + } + if item, err := ctxPtr.subCachedResolvedIPs.Get(hostname); err == nil { + return item.(types.CachedResolvedIPs).CachedIPs + } + return nil +} + var ( serverNameAndPort string onboardTLSConfig *tls.Config @@ -179,6 +190,19 @@ func Run(ps *pubsub.PubSub, loggerArg *logrus.Logger, logArg *base.LogObject, ar clientCtx.subGlobalConfig = subGlobalConfig subGlobalConfig.Activate() + subCachedResolvedIPs, err := ps.NewSubscription(pubsub.SubscriptionOptions{ + AgentName: "nim", + MyAgentName: agentName, + WarningTime: warningTime, + ErrorTime: errorTime, + TopicImpl: types.CachedResolvedIPs{}, + Activate: true, + }) + if err != nil { + log.Fatal(err) + } + clientCtx.subCachedResolvedIPs = 
subCachedResolvedIPs + subDeviceNetworkStatus, err := ps.NewSubscription(pubsub.SubscriptionOptions{ CreateHandler: handleDNSCreate, ModifyHandler: handleDNSModify, @@ -196,13 +220,14 @@ func Run(ps *pubsub.PubSub, loggerArg *logrus.Logger, logArg *base.LogObject, ar clientCtx.subDeviceNetworkStatus = subDeviceNetworkStatus subDeviceNetworkStatus.Activate() zedcloudCtx := zedcloud.NewContext(log, zedcloud.ContextOptions{ - DevNetworkStatus: clientCtx.deviceNetworkStatus, - SendTimeout: clientCtx.globalConfig.GlobalValueInt(types.NetworkSendTimeout), - DialTimeout: clientCtx.globalConfig.GlobalValueInt(types.NetworkDialTimeout), - AgentMetrics: clientCtx.zedcloudMetrics, - Serial: hardware.GetProductSerial(log), - SoftSerial: hardware.GetSoftSerial(log), - AgentName: agentName, + DevNetworkStatus: clientCtx.deviceNetworkStatus, + SendTimeout: clientCtx.globalConfig.GlobalValueInt(types.NetworkSendTimeout), + DialTimeout: clientCtx.globalConfig.GlobalValueInt(types.NetworkDialTimeout), + ResolverCacheFunc: clientCtx.getCachedResolvedIPs, + AgentMetrics: clientCtx.zedcloudMetrics, + Serial: hardware.GetProductSerial(log), + SoftSerial: hardware.GetSoftSerial(log), + AgentName: agentName, }) clientCtx.zedcloudCtx = &zedcloudCtx @@ -355,6 +380,9 @@ func Run(ps *pubsub.PubSub, loggerArg *logrus.Logger, logArg *base.LogObject, ar return ret } + case change := <-subCachedResolvedIPs.MsgChan(): + subCachedResolvedIPs.ProcessChange(change) + case <-ticker.C: // Check in case /config/server changes while running nserver, err := os.ReadFile(types.ServerFileName) diff --git a/pkg/pillar/cmd/loguploader/loguploader.go b/pkg/pillar/cmd/loguploader/loguploader.go index 49a385e93c..934ff93d7f 100644 --- a/pkg/pillar/cmd/loguploader/loguploader.go +++ b/pkg/pillar/cmd/loguploader/loguploader.go @@ -76,6 +76,7 @@ type loguploaderContext struct { subDeviceNetworkStatus pubsub.Subscription subGlobalConfig pubsub.Subscription subAppInstConfig pubsub.Subscription + subCachedResolvedIPs 
pubsub.Subscription usableAddrCount int metrics types.NewlogMetrics zedcloudMetrics *zedcloud.AgentMetrics @@ -86,6 +87,16 @@ type loguploaderContext struct { backoffExprTimer *time.Timer } +func (ctx *loguploaderContext) getCachedResolvedIPs(hostname string) []types.CachedIP { + if ctx.subCachedResolvedIPs == nil { + return nil + } + if item, err := ctx.subCachedResolvedIPs.Get(hostname); err == nil { + return item.(types.CachedResolvedIPs).CachedIPs + } + return nil +} + // Run - an loguploader run func Run(ps *pubsub.PubSub, loggerArg *logrus.Logger, logArg *base.LogObject, arguments []string) int { logger = loggerArg @@ -205,6 +216,19 @@ func Run(ps *pubsub.PubSub, loggerArg *logrus.Logger, logArg *base.LogObject, ar } log.Functionf("Have %d management ports with usable addresses", loguploaderCtx.usableAddrCount) + subCachedResolvedIPs, err := ps.NewSubscription(pubsub.SubscriptionOptions{ + AgentName: "nim", + MyAgentName: agentName, + WarningTime: warningTime, + ErrorTime: errorTime, + TopicImpl: types.CachedResolvedIPs{}, + Activate: true, + }) + if err != nil { + log.Fatal(err) + } + loguploaderCtx.subCachedResolvedIPs = subCachedResolvedIPs + // Publish cloud metrics pubCloud, err := ps.NewPublication( pubsub.PublicationOptions{ @@ -279,6 +303,9 @@ func Run(ps *pubsub.PubSub, loggerArg *logrus.Logger, logArg *base.LogObject, ar case change := <-subAppInstConfig.MsgChan(): subAppInstConfig.ProcessChange(change) + case change := <-subCachedResolvedIPs.MsgChan(): + subCachedResolvedIPs.ProcessChange(change) + case <-publishCloudTimer.C: start := time.Now() log.Tracef("publishCloudTimer cloud metrics at at %s", time.Now().String()) @@ -403,13 +430,14 @@ func sendCtxInit(ctx *loguploaderContext) { //set newlog url zedcloudCtx := zedcloud.NewContext(log, zedcloud.ContextOptions{ - DevNetworkStatus: deviceNetworkStatus, - SendTimeout: ctx.globalConfig.GlobalValueInt(types.NetworkSendTimeout), - DialTimeout: 
ctx.globalConfig.GlobalValueInt(types.NetworkDialTimeout), - AgentMetrics: ctx.zedcloudMetrics, - Serial: hardware.GetProductSerial(log), - SoftSerial: hardware.GetSoftSerial(log), - AgentName: agentName, + DevNetworkStatus: deviceNetworkStatus, + SendTimeout: ctx.globalConfig.GlobalValueInt(types.NetworkSendTimeout), + DialTimeout: ctx.globalConfig.GlobalValueInt(types.NetworkDialTimeout), + AgentMetrics: ctx.zedcloudMetrics, + ResolverCacheFunc: ctx.getCachedResolvedIPs, + Serial: hardware.GetProductSerial(log), + SoftSerial: hardware.GetSoftSerial(log), + AgentName: agentName, }) zedcloudCtx.DevUUID = ctx.devUUID diff --git a/pkg/pillar/cmd/nim/controllerdns.go b/pkg/pillar/cmd/nim/controllerdns.go deleted file mode 100644 index b97a23c3e0..0000000000 --- a/pkg/pillar/cmd/nim/controllerdns.go +++ /dev/null @@ -1,224 +0,0 @@ -// Copyright (c) 2021 Zededa, Inc. -// SPDX-License-Identifier: Apache-2.0 - -package nim - -import ( - "bytes" - "errors" - "fmt" - "io/fs" - "os" - "time" - - "github.com/lf-edge/eve/pkg/pillar/devicenetwork" - "github.com/lf-edge/eve/pkg/pillar/types" -) - -const ( - minTTLSec int = 30 - maxTTLSec int = 3600 - extraSec int = 10 - etcHostFileName = "/etc/hosts" - tmpHostFileName = "/tmp/etchosts" - resolvFileName = "/etc/resolv.conf" -) - -// go routine for dns query to the controller -func (n *nim) queryControllerDNS() { - var etchosts, controllerServer []byte - var ttlSec int - var ipaddrCached string - - if _, err := os.Stat(etcHostFileName); err == nil { - etchosts, err = os.ReadFile(etcHostFileName) - if err == nil { - controllerServer, _ = os.ReadFile(types.ServerFileName) - controllerServer = bytes.TrimSuffix(controllerServer, []byte("\n")) - if bytes.Contains(controllerServer, []byte(":")) { - serverport := bytes.Split(controllerServer, []byte(":")) - if len(serverport) == 2 { - controllerServer = serverport[0] - } - } - } - } - - if len(controllerServer) == 0 { - n.Log.Errorf("can't read /etc/hosts or server file") - return - } - 
- dnsTimer := time.NewTimer(time.Duration(minTTLSec) * time.Second) - - wdName := agentName + "dnsQuery" - stillRunning := time.NewTicker(stillRunTime) - n.PubSub.StillRunning(wdName, warningTime, errorTime) - n.PubSub.RegisterFileWatchdog(wdName) - - for { - select { - case <-dnsTimer.C: - // base on ttl from server dns update frequency for controller IP resolve - // even if the dns server implementation returns the remaining value of the TTL it caches, - // it will still work. - ipaddrCached, ttlSec = n.controllerDNSCache(etchosts, controllerServer, ipaddrCached) - dnsTimer = time.NewTimer(time.Duration(ttlSec) * time.Second) - - case <-stillRunning.C: - } - n.PubSub.StillRunning(wdName, warningTime, errorTime) - } -} - -func (n *nim) resolveWithPorts(domain string) []devicenetwork.DNSResponse { - dnsResponse, errs := devicenetwork.ResolveWithPortsLambda( - domain, - n.dpcManager.GetDNS(), - devicenetwork.ResolveWithSrcIP, - ) - if len(errs) > 0 { - n.Log.Warnf("resolveWithPortsLambda failed: %+v", errs) - } - return dnsResponse -} - -// periodical cache the controller DNS resolution into /etc/hosts file -// it returns the cached ip string, and TTL setting from the server -func (n *nim) controllerDNSCache( - etchosts, controllerServer []byte, - ipaddrCached string, -) (string, int) { - // Check to see if the server domain is already in the /etc/hosts as in eden, - // then skip this DNS queries - isCached, ipAddrCached, ttlCached := n.checkCachedEntry( - etchosts, - controllerServer, - ipaddrCached, - ) - if isCached { - return ipAddrCached, ttlCached - } - - err := os.Remove(tmpHostFileName) - if err != nil && !errors.Is(err, fs.ErrNotExist) { - n.Log.Warnf("%s exists but removing failed: %+v", tmpHostFileName, err) - } - - dnsResponses := n.resolveWithPorts(string(controllerServer)) - for _, dnsResponse := range dnsResponses { - if dnsResponse.IP.String() == ipAddrCached { - return ipAddrCached, getTTL(time.Duration(dnsResponse.TTL)) - } - } - - lookupIPaddr := 
n.writeHostsFile(dnsResponses, etchosts, controllerServer) - if lookupIPaddr != "" { - n.Log.Tracef("append controller IP %s to /etc/hosts", lookupIPaddr) - } - - if len(dnsResponses) > 0 { - ipaddrCached = dnsResponses[0].IP.String() - ttlSec := getTTL(time.Duration(dnsResponses[0].TTL)) - return ipaddrCached, ttlSec - } - - // No response or a failure; make sure we redo the query after 30 seconds. - return "", minTTLSec -} - -func (n *nim) writeHostsFile( - dnsResponses []devicenetwork.DNSResponse, - etchosts, controllerServer []byte, -) string { - return n.writeHostsFileToDestination(dnsResponses, etchosts, controllerServer, etcHostFileName) -} - -func (n *nim) writeHostsFileToDestination( - dnsResponses []devicenetwork.DNSResponse, - etchosts, controllerServer []byte, - destination string, -) string { - var newhosts []byte - - var lookupIPaddr string - - if len(dnsResponses) == 0 { - newhosts = append(newhosts, etchosts...) - } else { - newhosts = append([]byte{}, etchosts...) - for _, dnsResponse := range dnsResponses { - lookupIPaddr = dnsResponse.IP.String() - serverEntry := fmt.Sprintf("%s %s\n", lookupIPaddr, controllerServer) - newhosts = append(newhosts, []byte(serverEntry)...) 
- } - } - - err := os.WriteFile(tmpHostFileName, newhosts, 0644) - if err != nil { - n.Log.Errorf("can not write /tmp/etchosts file %v", err) - return "" - } - if err := os.Rename(tmpHostFileName, destination); err != nil { - n.Log.Errorf("can not rename %s file %v", destination, err) - return "" - } - - return lookupIPaddr -} - -func (*nim) readNameservers() []string { - var nameServers []string - dnsServer, _ := os.ReadFile(resolvFileName) - dnsRes := bytes.Split(dnsServer, []byte("\n")) - for _, d := range dnsRes { - d1 := bytes.Split(d, []byte("nameserver ")) - if len(d1) == 2 { - nameServers = append(nameServers, string(d1[1])) - } - } - if len(nameServers) == 0 { - nameServers = append(nameServers, "8.8.8.8") - } - return nameServers -} - -func (n *nim) checkCachedEntry( - etchosts []byte, - controllerServer []byte, - ipaddrCached string, -) (bool, string, int) { - if len(etchosts) == 0 || len(controllerServer) == 0 { - return true, ipaddrCached, maxTTLSec - } - - if ipaddrCached == "" { - hostsEntries := bytes.Split(etchosts, []byte("\n")) - for _, entry := range hostsEntries { - fields := bytes.Fields(entry) - if len(fields) == 2 { - if bytes.Compare(fields[1], controllerServer) == 0 { - n.Log.Tracef("server entry %s already in /etc/hosts, skip", controllerServer) - return true, ipaddrCached, maxTTLSec - } - } - } - } - return false, "", 0 -} - -func getTTL(ttl time.Duration) int { - ttlSec := int(ttl.Seconds()) - if ttlSec < minTTLSec { - // this can happen often, when the dns server returns ttl being the remaining value - // of it's own cached ttl, we set it to minTTLSec and retry. Next time will get the - // upper range value of it's remaining ttl. 
- ttlSec = minTTLSec - } else if ttlSec > maxTTLSec { - ttlSec = maxTTLSec - } - - // some dns server returns actual remaining time of TTL, to avoid next time - // get 0 or 1 those numbers, add some extra seconds - return ttlSec + extraSec -} diff --git a/pkg/pillar/cmd/nim/controllerdns_test.go b/pkg/pillar/cmd/nim/controllerdns_test.go deleted file mode 100644 index 75c3e39579..0000000000 --- a/pkg/pillar/cmd/nim/controllerdns_test.go +++ /dev/null @@ -1,79 +0,0 @@ -package nim - -import ( - "fmt" - "io" - "os" - "strings" - "testing" - - "github.com/lf-edge/eve/pkg/pillar/base" - "github.com/lf-edge/eve/pkg/pillar/devicenetwork" - "github.com/lf-edge/eve/pkg/pillar/dpcmanager" - "github.com/sirupsen/logrus" -) - -func createTestNim() *nim { - var n nim - - dpcManager := dpcmanager.DpcManager{} - n.dpcManager = &dpcManager - logger := logrus.StandardLogger() - log := base.NewSourceLogObject(logger, "zedagent", 1234) - n.Logger = logger - n.Log = log - - return &n -} - -func TestControllerDNSCacheIndexOutOfRange(t *testing.T) { - // Regression test for bug introduced by switching to miekg/dns - n := createTestNim() - - n.controllerDNSCache([]byte(""), []byte("1.1"), "") -} - -func TestWriteHostsFile(t *testing.T) { - n := createTestNim() - - dnsResponses := []devicenetwork.DNSResponse{ - { - IP: []byte{1, 1, 1, 1}, - }, - { - IP: []byte{1, 0, 0, 1}, - }, - } - - dnsName := "one.one.one.one" - - f, err := os.CreateTemp("", "writeHostsFile.*.etchosts") - if err != nil { - panic(err) - } - defer os.Remove(f.Name()) - f.Close() - - n.writeHostsFileToDestination(dnsResponses, []byte{}, []byte(dnsName), f.Name()) - - // reopen the file to be able to read what has been written by writeHostsFileToDestination; f.Seek(0, 0) unfortunately is not enough - f, err = os.Open(f.Name()) - if err != nil { - panic(err) - } - content, err := io.ReadAll(f) - if err != nil { - panic(err) - } - - for _, dnsResponse := range dnsResponses { - expectedContent := fmt.Sprintf("%s %s\n", 
dnsResponse.IP.String(), dnsName) - if !strings.Contains(string(content), expectedContent) { - t.Fatalf( - "writing to hosts file failed, expected: '%s', got: '%s'", - expectedContent, - content, - ) - } - } -} diff --git a/pkg/pillar/cmd/nim/nim.go b/pkg/pillar/cmd/nim/nim.go index ccdb542e52..7c7e937c55 100644 --- a/pkg/pillar/cmd/nim/nim.go +++ b/pkg/pillar/cmd/nim/nim.go @@ -100,6 +100,7 @@ type nim struct { pubWwanStatus pubsub.Publication pubWwanMetrics pubsub.Publication pubWwanLocationInfo pubsub.Publication + pubCachedResolvedIPs pubsub.Publication // Metrics zedcloudMetrics *zedcloud.AgentMetrics @@ -314,7 +315,7 @@ func (n *nim) run(ctx context.Context) (err error) { if err = n.subAssignableAdapters.Activate(); err != nil { return err } - go n.queryControllerDNS() + go n.runResolverCacheForController() return nil } if !waitForLastResort { @@ -514,6 +515,15 @@ func (n *nim) initPublications() (err error) { if err != nil { return err } + + n.pubCachedResolvedIPs, err = n.PubSub.NewPublication( + pubsub.PublicationOptions{ + AgentName: agentName, + TopicType: types.CachedResolvedIPs{}, + }) + if err != nil { + return err + } return nil } diff --git a/pkg/pillar/cmd/nim/resolvercache.go b/pkg/pillar/cmd/nim/resolvercache.go new file mode 100644 index 0000000000..98f3c880ad --- /dev/null +++ b/pkg/pillar/cmd/nim/resolvercache.go @@ -0,0 +1,122 @@ +// Copyright (c) 2023 Zededa, Inc. +// SPDX-License-Identifier: Apache-2.0 + +package nim + +import ( + "net" + "os" + "strings" + "time" + + "github.com/lf-edge/eve/pkg/pillar/devicenetwork" + "github.com/lf-edge/eve/pkg/pillar/types" +) + +const ( + // TTL in seconds assumed when DNS response is missing (has zero) TTL. + defaultTTL = 30 + + // How often to rerun lookup and store fresh entries into the cache. + defaultRefetchPeriod = 30 * time.Second + maxRefetchPeriod = 1 * time.Hour + refetchDelay = 3 * time.Second +) + +// Go routine that periodically resolves and caches controller IP address. 
+// The cached IP address can be used with SendOnIntf function to speed up +// controller API calls by avoiding repeated hostname resolutions. +func (n *nim) runResolverCacheForController() { + content, err := os.ReadFile(types.ServerFileName) + if err != nil { + n.Log.Errorf("Failed to read %s: %v; "+ + "will not run resolver cache for the controller hostname", + types.ServerFileName, err) + return + } + controllerHostname := string(content) + controllerHostname = strings.TrimSpace(controllerHostname) + if host, _, err := net.SplitHostPort(controllerHostname); err == nil { + controllerHostname = host + } + if net.ParseIP(controllerHostname) != nil { + // Controller hostname is already defined as an IP address. + return + } + + dnsQueryTimer := time.NewTimer(defaultRefetchPeriod) + + wdName := agentName + "-resolverCache" + stillRunning := time.NewTicker(stillRunTime) + n.PubSub.StillRunning(wdName, warningTime, errorTime) + n.PubSub.RegisterFileWatchdog(wdName) + + for { + select { + case <-dnsQueryTimer.C: + // Use smallest returned TTL as the update frequency. + // Even if the DNS server implementation returns the remaining value + // of the TTL it caches, it will still work. + minTTL := n.resolveAndCacheIP(controllerHostname) + var retryAfter time.Duration + if minTTL == 0 { + // No response or a failure; make sure we redo the query after 30 seconds. + retryAfter = defaultRefetchPeriod + } else { + retryAfter = time.Duration(minTTL) * time.Second + // DNS server may return TTL as the remaining time of its own cached TTL. + // In order to avoid re-fetching controller IP when TTL is close to zero + // (and thus caching it is practically pointless), we wait a few extra + // seconds before running DNS query again. + retryAfter += refetchDelay + } + // Make sure we do not stop re-fetching for a long time if the returned + // TTL is some crazy high value. 
+ if retryAfter > maxRefetchPeriod { + retryAfter = maxRefetchPeriod + } + dnsQueryTimer = time.NewTimer(retryAfter) + + case <-stillRunning.C: + } + n.PubSub.StillRunning(wdName, warningTime, errorTime) + } +} + +func (n *nim) doDNSQuery(hostname string) []devicenetwork.DNSResponse { + dnsResponse, errs := devicenetwork.ResolveWithPortsLambda( + hostname, + n.dpcManager.GetDNS(), + devicenetwork.ResolveWithSrcIP, + ) + if len(errs) > 0 { + n.Log.Warnf("doDNSQuery failed: %+v", errs) + } + return dnsResponse +} + +// Try to resolve the IP address for the given hostname and cache it using pubsub. +// Currently used only for the controller hostname. +func (n *nim) resolveAndCacheIP(hostname string) (minTTL uint32) { + queryTime := time.Now() + dnsResponses := n.doDNSQuery(hostname) + cachedData := types.CachedResolvedIPs{Hostname: hostname} + for _, dnsResp := range dnsResponses { + if dnsResp.TTL == 0 { + dnsResp.TTL = defaultTTL + } + cachedData.CachedIPs = append(cachedData.CachedIPs, types.CachedIP{ + IPAddress: dnsResp.IP, + ValidUntil: queryTime.Add(time.Duration(dnsResp.TTL) * time.Second), + }) + if minTTL == 0 || dnsResp.TTL < minTTL { + minTTL = dnsResp.TTL + } + } + err := n.pubCachedResolvedIPs.Publish(hostname, cachedData) + if err != nil { + n.Log.Errorf("Failed to cache resolved IPs for hostname %s: %v", + hostname, err) + } + return minTTL +} diff --git a/pkg/pillar/cmd/zedagent/handleconfig.go b/pkg/pillar/cmd/zedagent/handleconfig.go index 508fa5e8ed..e937312f73 100644 --- a/pkg/pillar/cmd/zedagent/handleconfig.go +++ b/pkg/pillar/cmd/zedagent/handleconfig.go @@ -103,8 +103,9 @@ type getconfigContext struct { pubVolumeConfig pubsub.Publication pubDisksConfig pubsub.Publication pubEdgeNodeInfo pubsub.Publication + subCachedResolvedIPs pubsub.Subscription NodeAgentStatus *types.NodeAgentStatus - configProcessingSkipFlag bool + configProcessingRV configProcessingRetval lastReceivedConfig time.Time // controller or local clocks lastProcessedConfig 
time.Time // controller or local clocks lastConfigTimestamp time.Time // controller clocks (zero if not available) @@ -152,6 +153,16 @@ type getconfigContext struct { cipherContexts map[string]types.CipherContext } +func (ctx *getconfigContext) getCachedResolvedIPs(hostname string) []types.CachedIP { + if ctx.subCachedResolvedIPs == nil { + return nil + } + if item, err := ctx.subCachedResolvedIPs.Get(hostname); err == nil { + return item.(types.CachedResolvedIPs).CachedIPs + } + return nil +} + // current devUUID from OnboardingStatus var devUUID uuid.UUID @@ -185,14 +196,39 @@ func (s configSource) String() string { type configProcessingRetval int const ( - configOK configProcessingRetval = iota - configReqFailed // failed to request latest config - obsoleteConfig // newer config is already applied - invalidConfig // config is not valid (cannot be parsed, UUID mismatch, bad signature, etc.) - skipConfig // reboot or shutdown flag is set - defferConfig // not ready to process config yet + configOK configProcessingRetval = iota + configReqFailed // failed to request latest config + obsoleteConfig // newer config is already applied + invalidConfig // config is not valid (cannot be parsed, UUID mismatch, bad signature, etc.) 
+ skipConfigReboot // reboot or shutdown flag is set + skipConfigUpdate // update flag is set + deferConfig // not ready to process config yet ) +func (r configProcessingRetval) isSkip() bool { + return r == skipConfigReboot || r == skipConfigUpdate +} + +func (r configProcessingRetval) String() string { + switch r { + case configOK: + return "configOK" + case configReqFailed: + return "configReqFailed" + case obsoleteConfig: + return "obsoleteConfig" + case invalidConfig: + return "invalidConfig" + case skipConfigReboot: + return "skipConfigReboot" + case skipConfigUpdate: + return "skipConfigUpdate" + case deferConfig: + return "deferConfig" + } + return "" +} + // Load bootstrap config provided that: // - it exists // - has not been loaded before (incl. previous device boots) @@ -304,7 +340,8 @@ func indicateInvalidBootstrapConfig(getconfigCtx *getconfigContext) { getconfigCtx.ledBlinkCount = types.LedBlinkInvalidBootstrapConfig } -func initZedcloudContext(networkSendTimeout, networkDialTimeout uint32, +func initZedcloudContext(getconfigCtx *getconfigContext, + networkSendTimeout, networkDialTimeout uint32, agentMetrics *zedcloud.AgentMetrics) *zedcloud.ZedCloudContext { // get the server name @@ -315,13 +352,14 @@ func initZedcloudContext(networkSendTimeout, networkDialTimeout uint32, serverNameAndPort = strings.TrimSpace(string(bytes)) zedcloudCtx := zedcloud.NewContext(log, zedcloud.ContextOptions{ - DevNetworkStatus: deviceNetworkStatus, - SendTimeout: networkSendTimeout, - DialTimeout: networkDialTimeout, - AgentMetrics: agentMetrics, - Serial: hardware.GetProductSerial(log), - SoftSerial: hardware.GetSoftSerial(log), - AgentName: agentName, + DevNetworkStatus: deviceNetworkStatus, + SendTimeout: networkSendTimeout, + DialTimeout: networkDialTimeout, + AgentMetrics: agentMetrics, + Serial: hardware.GetProductSerial(log), + SoftSerial: hardware.GetSoftSerial(log), + AgentName: agentName, + ResolverCacheFunc: getconfigCtx.getCachedResolvedIPs, // Enable all 
net traces but packet capture, which is already covered // by NIM (for the ping request). NetTraceOpts: []nettrace.TraceOpt{ @@ -357,9 +395,8 @@ func configTimerTask(getconfigCtx *getconfigContext, handleChannel chan interfac iteration := 0 withNetTracing := traceNextConfigReq(ctx) retVal, tracedReqs := getLatestConfig(getconfigCtx, iteration, withNetTracing) - configProcessingSkipFlag := retVal == skipConfig - if configProcessingSkipFlag != getconfigCtx.configProcessingSkipFlag { - getconfigCtx.configProcessingSkipFlag = configProcessingSkipFlag + if getconfigCtx.configProcessingRV != retVal { + getconfigCtx.configProcessingRV = retVal triggerPublishDevInfo(ctx) } getconfigCtx.localServerMap.upToDate = false @@ -399,9 +436,8 @@ func configTimerTask(getconfigCtx *getconfigContext, handleChannel chan interfac withNetTracing = traceNextConfigReq(ctx) retVal, tracedReqs = getLatestConfig( getconfigCtx, iteration, withNetTracing) - configProcessingSkipFlag = retVal == skipConfig - if configProcessingSkipFlag != getconfigCtx.configProcessingSkipFlag { - getconfigCtx.configProcessingSkipFlag = configProcessingSkipFlag + if getconfigCtx.configProcessingRV != retVal { + getconfigCtx.configProcessingRV = retVal triggerPublishDevInfo(ctx) } getconfigCtx.localServerMap.upToDate = false @@ -419,8 +455,8 @@ func configTimerTask(getconfigCtx *getconfigContext, handleChannel chan interfac warningTime, errorTime) case <-stillRunning.C: - if getconfigCtx.configProcessingSkipFlag { - log.Noticef("config processing skip flag set") + if getconfigCtx.configProcessingRV != configOK { + log.Noticef("config processing flag is not OK: %s", getconfigCtx.configProcessingRV) } } ctx.ps.StillRunning(wdName, warningTime, errorTime) @@ -473,7 +509,7 @@ func updateCertTimer(configInterval uint32, tickerHandle interface{}) { // Start by trying the all the free management ports and then all the non-free // until one succeeds in communicating with the cloud. 
// We use the iteration argument to start at a different point each time. -// Returns a configProcessingSkipFlag +// Returns the configProcessingRetval and the traced requests if any. func requestConfigByURL(getconfigCtx *getconfigContext, url string, iteration int, withNetTracing bool) (configProcessingRetval, []netdump.TracedNetRequest) { @@ -484,7 +520,7 @@ func requestConfigByURL(getconfigCtx *getconfigContext, url string, if getconfigCtx.zedagentCtx.bootReason == types.BootReasonFirst && !getconfigCtx.zedagentCtx.publishedEdgeNodeCerts { log.Noticef("Defer fetching config until our EdgeNodeCerts have been published") - return defferConfig, nil + return deferConfig, nil } ctx := getconfigCtx.zedagentCtx const bailOnHTTPErr = false // For 4xx and 5xx HTTP errors we try other interfaces @@ -912,7 +948,7 @@ func inhaleDeviceConfig(getconfigCtx *getconfigContext, config *zconfig.EdgeDevC } } - // add new BaseOS/App instances; returns configProcessingSkipFlag + // add new BaseOS/App instances; returns configProcessingRetval return parseConfig(getconfigCtx, config, source) } @@ -1117,7 +1153,7 @@ func publishConfigNetdump(ctx *zedagentContext, switch configRV { case configOK: topic = netDumpConfigOKTopic - case defferConfig: + case deferConfig: // There was no actual /config request so there is nothing interesting to publish. return default: diff --git a/pkg/pillar/cmd/zedagent/parseconfig.go b/pkg/pillar/cmd/zedagent/parseconfig.go index ebe544d929..696f31e017 100644 --- a/pkg/pillar/cmd/zedagent/parseconfig.go +++ b/pkg/pillar/cmd/zedagent/parseconfig.go @@ -81,9 +81,9 @@ func parseConfig(getconfigCtx *getconfigContext, config *zconfig.EdgeDevConfig, parseLocConfig(getconfigCtx, config) // Look for timers and other settings in configItems - // Process Config items even when configProcessingSkipFlag is set. + // Process Config items even when configProcessingRV is skipConfigReboot.
// Allows us to recover if the system got stuck after setting - // configProcessingSkipFlag + // configProcessingRV to skipConfigReboot parseConfigItems(getconfigCtx, config, source) // Did MaintenanceMode change? @@ -108,21 +108,24 @@ func parseConfig(getconfigCtx *getconfigContext, config *zconfig.EdgeDevConfig, // Any new reboot command? if rebootFlag { log.Noticeln("Reboot flag set, skipping config processing") - return skipConfig + return skipConfigReboot } // Any new shutdown command? if shutdownFlag { log.Noticeln("Shutdown flag set, skipping config processing") - return skipConfig + return skipConfigReboot } } - if getconfigCtx.configProcessingSkipFlag || ctx.deviceReboot || ctx.deviceShutdown { + if getconfigCtx.configProcessingRV == skipConfigReboot || ctx.deviceReboot || ctx.deviceShutdown { log.Noticef("parseConfig: Ignoring config as reboot/shutdown flag set") + return skipConfigReboot } else if ctx.maintenanceMode { log.Noticef("parseConfig: Ignoring config due to maintenanceMode") } else { + // We do not ignore config if we are in the baseOS upgrade process, as we need to check the volumes + // and the baseOS image configs if source != fromBootstrap { handleControllerCertsSha(ctx, config) parseCipherContext(getconfigCtx, config) @@ -147,11 +150,22 @@ func parseConfig(getconfigCtx *getconfigContext, config *zconfig.EdgeDevConfig, parseSystemAdapterConfig(getconfigCtx, config, source, forceSystemAdaptersParse) if source != fromBootstrap { - parseBaseOS(getconfigCtx, config) + activateNewBaseOS := parseBaseOS(getconfigCtx, config) parseNetworkInstanceConfig(getconfigCtx, config) parseContentInfoConfig(getconfigCtx, config) parseVolumeConfig(getconfigCtx, config) + // We have handled the volumes, so we can now process the app instances. But we need to check if + // we are in the middle of a baseOS upgrade, and if so, we need to skip processing the app instances.
+ if (source == fromController && activateNewBaseOS) || + (getconfigCtx.configProcessingRV == skipConfigUpdate) { + // We need to activate the new baseOS + // before we can process the app instances + // which depend on the new baseOS + log.Noticef("parseConfig: Ignoring config as a new baseOS image is being activated") + return skipConfigUpdate + } + // parseProfile must be called before processing of app instances from config parseProfile(getconfigCtx, config) parseAppInstanceConfig(getconfigCtx, config) @@ -243,7 +257,12 @@ func shutdownAppsGlobal(ctx *zedagentContext) { var baseOSPrevConfigHash []byte func parseBaseOS(getconfigCtx *getconfigContext, - config *zconfig.EdgeDevConfig) { + config *zconfig.EdgeDevConfig) (activateNewBaseOSFlag bool) { + // activateNewBaseOSFlag is set to true if we need to activate a new baseOS: + // 1. If the config has a new baseOS image with the activate flag set to true + // 2. If the config has a previous baseOS image, but the activate flag is _switched_ from false to true + // We don't care if the active flag already was true, as that means that the process of activating has already started. 
+ activateNewBaseOSFlag = false baseOS := config.GetBaseos() if baseOS == nil { @@ -276,16 +295,34 @@ func parseBaseOS(getconfigCtx *getconfigContext, RetryUpdateCounter: getconfigCtx.configRetryUpdateCounter, Activate: baseOS.Activate, } - // First look for deleted ones + + // Check if the BaseOsConfig already exists + prevBaseOsConfig, _ := getconfigCtx.pubBaseOsConfig.Get(cfg.Key()) + if prevBaseOsConfig == nil { + // If we don't have a BaseOsConfig with the same key already published, it's a new one + // Check for activation flag + if cfg.Activate { + activateNewBaseOSFlag = true + } + } + + // Go through all published BaseOsConfig's and delete the ones which are not in the config + // and detect if we have a BaseOsConfig which has changed from Activate=false to Activate=true items := getconfigCtx.pubBaseOsConfig.GetAll() for idStr := range items { if idStr != cfg.Key() { log.Functionf("parseBaseOS: deleting %s\n", idStr) unpublishBaseOsConfig(getconfigCtx, idStr) + } else { + if !items[idStr].(types.BaseOsConfig).Activate && cfg.Activate { + log.Functionf("parseBaseOS: Activate set for %s", idStr) + activateNewBaseOSFlag = true + } } } // publish new one publishBaseOsConfig(getconfigCtx, cfg) + return } var networkConfigPrevConfigHash []byte diff --git a/pkg/pillar/cmd/zedagent/zedagent.go b/pkg/pillar/cmd/zedagent/zedagent.go index 6b6f5053d8..fe2e562f15 100644 --- a/pkg/pillar/cmd/zedagent/zedagent.go +++ b/pkg/pillar/cmd/zedagent/zedagent.go @@ -177,18 +177,21 @@ type zedagentContext struct { // This is the value of counter that triggered reboot. This is sent in // device info msg. Can be used to verify device is caught up on all // outstanding reboot commands from cloud. 
- rebootConfigCounter uint32 - shutdownConfigCounter uint32 - subDevicePortConfigList pubsub.Subscription - DevicePortConfigList *types.DevicePortConfigList - remainingTestTime time.Duration - physicalIoAdapterMap map[string]types.PhysicalIOAdapter - globalConfig types.ConfigItemValueMap - globalConfigPublished bool // was last globalConfig successfully published - specMap types.ConfigItemSpecMap - globalStatus types.GlobalStatus - flowLogMetrics types.FlowlogMetrics - appContainerStatsTime time.Time // last time the App Container stats uploaded + rebootConfigCounter uint32 + shutdownConfigCounter uint32 + // Part of the fields above (the reboot ones) are initialized only once the NodeAgent status is received + // This flag is used to make sure we initialize them before continuing with the rest of the agent's initialization + initializedFromNodeAgentStatus bool + subDevicePortConfigList pubsub.Subscription + DevicePortConfigList *types.DevicePortConfigList + remainingTestTime time.Duration + physicalIoAdapterMap map[string]types.PhysicalIOAdapter + globalConfig types.ConfigItemValueMap + globalConfigPublished bool // was last globalConfig successfully published + specMap types.ConfigItemSpecMap + globalStatus types.GlobalStatus + flowLogMetrics types.FlowlogMetrics + appContainerStatsTime time.Time // last time the App Container stats uploaded // The MaintenanceMode can come from GlobalConfig and from the config // API. 
Those are merged into maintenanceMode // TBD will be also decide locally to go into maintenanceMode based @@ -392,7 +395,7 @@ func Run(ps *pubsub.PubSub, loggerArg *logrus.Logger, logArg *base.LogObject, ar reinitNetdumper(zedagentCtx) // We know our own UUID; prepare for communication with controller - zedcloudCtx = initZedcloudContext( + zedcloudCtx = initZedcloudContext(getconfigCtx, zedagentCtx.globalConfig.GlobalValueInt(types.NetworkSendTimeout), zedagentCtx.globalConfig.GlobalValueInt(types.NetworkDialTimeout), zedagentCtx.zedcloudMetrics) @@ -430,6 +433,11 @@ func Run(ps *pubsub.PubSub, loggerArg *logrus.Logger, logArg *base.LogObject, ar // With device UUID, zedagent is ready to initialize and activate all subscriptions. initPostOnboardSubs(zedagentCtx) + // Wait until we initialize the context from node agent status. + // At least we need to be sure the bootReason field is set properly, as it's used during fetching local config, + // when it's necessary (necessary or not is determined exactly by the bootReason). 
+ waitUntilInitializedFromNodeAgentStatus(zedagentCtx, stillRunning) + //initialize cipher processing block cipherModuleInitialize(zedagentCtx) @@ -519,6 +527,19 @@ func Run(ps *pubsub.PubSub, loggerArg *logrus.Logger, logArg *base.LogObject, ar return 0 } +func waitUntilInitializedFromNodeAgentStatus(ctx *zedagentContext, running *time.Ticker) { + log.Functionf("waitUntilInitializedFromNodeAgentStatus()") + for !ctx.initializedFromNodeAgentStatus { + select { + case change := <-ctx.getconfigCtx.subNodeAgentStatus.MsgChan(): + ctx.getconfigCtx.subNodeAgentStatus.ProcessChange(change) + case <-running.C: + } + ctx.ps.StillRunning(agentName, warningTime, errorTime) + } + log.Functionf("waitUntilInitializedFromNodeAgentStatus() done") +} + func (zedagentCtx *zedagentContext) init() { zedagentCtx.zedcloudMetrics = zedcloud.NewAgentMetrics() zedagentCtx.specMap = types.NewConfigItemSpecMap() @@ -986,6 +1007,9 @@ func mainEventLoop(zedagentCtx *zedagentContext, stillRunning *time.Ticker) { case change := <-zedagentCtx.subZFSPoolMetrics.MsgChan(): zedagentCtx.subZFSPoolMetrics.ProcessChange(change) + case change := <-getconfigCtx.subCachedResolvedIPs.MsgChan(): + getconfigCtx.subCachedResolvedIPs.ProcessChange(change) + case <-hwInfoTiker.C: triggerPublishHwInfo(zedagentCtx) @@ -1825,6 +1849,18 @@ func initPostOnboardSubs(zedagentCtx *zedagentContext) { if err != nil { log.Fatal(err) } + + getconfigCtx.subCachedResolvedIPs, err = ps.NewSubscription(pubsub.SubscriptionOptions{ + AgentName: "nim", + MyAgentName: agentName, + WarningTime: warningTime, + ErrorTime: errorTime, + TopicImpl: types.CachedResolvedIPs{}, + Activate: true, + }) + if err != nil { + log.Fatal(err) + } } func triggerPublishHwInfoToDest(ctxPtr *zedagentContext, dest destinationBitset) { @@ -2326,6 +2362,8 @@ func handleNodeAgentStatusImpl(ctxArg interface{}, key string, ctx.bootReason = status.BootReason ctx.restartCounter = status.RestartCounter ctx.allDomainsHalted = status.AllDomainsHalted + // Mark 
that we have received the NodeAgentStatus and initialized the context properly + ctx.initializedFromNodeAgentStatus = true // if config reboot command was initiated and // was deferred, and the device is not in inprogress // state, initiate the reboot process diff --git a/pkg/pillar/dpcmanager/verify.go b/pkg/pillar/dpcmanager/verify.go index c81b758d3b..984e54f696 100644 --- a/pkg/pillar/dpcmanager/verify.go +++ b/pkg/pillar/dpcmanager/verify.go @@ -338,7 +338,7 @@ func (m *DpcManager) verifyDPC(ctx context.Context) (status types.DPCState) { } m.Log.Errorf("DPC verify: no IP/DNS: exceeded timeout (waited for %v): "+ "%v for %+v\n", elapsed, err, dpc) - dpc.RecordFailure(err.Error()) + dpc.RecordFailure(unwrapPortsNotReady(err).Error()) status = types.DPCStateFail dpc.State = status return status @@ -362,7 +362,7 @@ func (m *DpcManager) verifyDPC(ctx context.Context) (status types.DPCState) { } m.Log.Errorf("DPC verify: ports %v are not ready: exceeded timeout (waited for %v): "+ "%v for %+v\n", notReadyErr.Ports, elapsed, err, dpc) - dpc.RecordFailure(err.Error()) + dpc.RecordFailure(unwrapPortsNotReady(err).Error()) status = types.DPCStateFailWithIPAndDNS dpc.State = status return status @@ -577,3 +577,16 @@ func (m *DpcManager) checkMgmtPortsPresence() (available, missing []string) { } return available, missing } + +// If error returned from connectivity test was wrapped into PortsNotReady, +// unwrap it before recording it into DeviceNetworkStatus and DPCL. +// PortsNotReady error type is only useful between ConnectivityTester and DPC +// Manager to determine next steps in the connectivity testing process, +// but otherwise in wider context it produces somewhat confusing error +// message for users. 
+func unwrapPortsNotReady(err error) error { + if pnrErr, isPNRErr := err.(*conntester.PortsNotReady); isPNRErr { + return pnrErr.Unwrap() + } + return err +} diff --git a/pkg/pillar/nireconciler/linux_config.go b/pkg/pillar/nireconciler/linux_config.go index c94a8c0be3..c8b4a63bce 100644 --- a/pkg/pillar/nireconciler/linux_config.go +++ b/pkg/pillar/nireconciler/linux_config.go @@ -614,6 +614,7 @@ func (r *LinuxNIReconciler) getIntendedNIL3Cfg(niID uuid.UUID) dg.Graph { LogAndErrPrefix, outIfIndex, err) continue } + // Copy routes from the main table into the NI-specific table. for _, rt := range routes { rtCopy := rt.Data.(netlink.Route) rtCopy.Table = dstTable @@ -627,8 +628,9 @@ func (r *LinuxNIReconciler) getIntendedNIL3Cfg(niID uuid.UUID) dg.Graph { } rtCopy.Protocol = unix.RTPROT_STATIC intendedL3Cfg.PutItem(linux.Route{ - Route: rtCopy, - OutputIf: rtOutIf, + Route: rtCopy, + OutputIf: rtOutIf, + GwViaLinkRoute: gwViaLinkRoute(rt, routes), }, nil) } } @@ -1051,6 +1053,23 @@ func (r *LinuxNIReconciler) getNISubnet(ni *niInfo) *net.IPNet { } } +// gwViaLinkRoute returns true if the given route uses gateway routed by another +// link-scoped route. +func gwViaLinkRoute(route netmonitor.Route, routingTable []netmonitor.Route) bool { + if len(route.Gw) == 0 { + return false + } + gwHostSubnet := devicenetwork.HostSubnet(route.Gw) + for _, route2 := range routingTable { + netlinkRoute2 := route2.Data.(netlink.Route) + if netlinkRoute2.Scope == netlink.SCOPE_LINK && + utils.EqualIPNets(netlinkRoute2.Dst, gwHostSubnet) { + return true + } + } + return false +} + // HostIPSetBasename returns basename (without the "ipvX." prefix) to use for ipset // matching a given domain name (ACE match of type "host"). 
// Needs to ensure that the ipset name doesn't exceed the length diff --git a/pkg/pillar/nireconciler/linux_currentstate.go b/pkg/pillar/nireconciler/linux_currentstate.go index f2b15da449..a63d01dfc9 100644 --- a/pkg/pillar/nireconciler/linux_currentstate.go +++ b/pkg/pillar/nireconciler/linux_currentstate.go @@ -269,8 +269,9 @@ func (r *LinuxNIReconciler) updateCurrentNIRoutes(niID uuid.UUID) (changed bool) } for _, rt := range routes { route := linux.Route{ - Route: rt.Data.(netlink.Route), - OutputIf: rtOutIf, + Route: rt.Data.(netlink.Route), + OutputIf: rtOutIf, + GwViaLinkRoute: gwViaLinkRoute(rt, routes), } prevRoute := prevRoutes[dg.Reference(route)] if prevRoute == nil || !prevRoute.Equal(route) { diff --git a/pkg/pillar/nireconciler/linuxitems/route.go b/pkg/pillar/nireconciler/linuxitems/route.go index 098167abc4..81be15c45e 100644 --- a/pkg/pillar/nireconciler/linuxitems/route.go +++ b/pkg/pillar/nireconciler/linuxitems/route.go @@ -12,6 +12,7 @@ import ( dg "github.com/lf-edge/eve/libs/depgraph" "github.com/lf-edge/eve/pkg/pillar/base" + "github.com/lf-edge/eve/pkg/pillar/devicenetwork" "github.com/lf-edge/eve/pkg/pillar/netmonitor" "github.com/lf-edge/eve/pkg/pillar/nireconciler/genericitems" "github.com/vishvananda/netlink" @@ -31,6 +32,9 @@ type Route struct { // OutputIf : output interface for the routed traffic. // Leave undefined if the destination is unreachable. OutputIf RouteOutIf + // GwViaLinkRoute is set to true if gateway is not included in the output interface + // subnet and therefore depends on a link route (RT_SCOPE_LINK) for reachability. + GwViaLinkRoute bool } // RouteOutIf : output interface for the route. @@ -143,7 +147,8 @@ func (r Route) Equal(other dg.Item) bool { return false } return r.normalizedNetlinkRoute().Equal(r2.normalizedNetlinkRoute()) && - r.OutputIf == r2.OutputIf + r.OutputIf == r2.OutputIf && + r.GwViaLinkRoute == r2.GwViaLinkRoute } // External returns false. 
@@ -184,7 +189,7 @@ func (r Route) Dependencies() (deps []dg.Dependency) { return false } } - if len(r.Gw) != 0 { + if !r.GwViaLinkRoute && len(r.Gw) != 0 { var gwMatch bool for _, ip := range ips { if ip.Contains(r.Gw) { @@ -238,6 +243,27 @@ func (r Route) Dependencies() (deps []dg.Dependency) { Description: "Dummy interface must exist and have matching IP address assigned", }) } + if r.GwViaLinkRoute && len(r.Gw) != 0 { + // Link route for the gateway must be configured first. + deps = append(deps, dg.Dependency{ + RequiredItem: dg.Reference(Route{ + Route: netlink.Route{ + Family: r.Family, + Table: r.Table, + Dst: devicenetwork.HostSubnet(r.Gw)}, + OutputIf: r.OutputIf, + }), + MustSatisfy: func(item dg.Item) bool { + gwRoute, isRoute := item.(Route) + if !isRoute { + // Should be unreachable + return false + } + return gwRoute.Scope == netlink.SCOPE_LINK + }, + Description: "Link route for the gateway must be configured first", + }) + } return deps } diff --git a/pkg/pillar/types/zedroutertypes.go b/pkg/pillar/types/zedroutertypes.go index 1e5deb8e79..8fb40a8f3d 100644 --- a/pkg/pillar/types/zedroutertypes.go +++ b/pkg/pillar/types/zedroutertypes.go @@ -11,6 +11,7 @@ import ( "os" "reflect" "sort" + "strings" "time" "github.com/eriknordmark/ipinfo" @@ -3468,3 +3469,69 @@ type AppBlobsAvailable struct { type AppInfo struct { AppBlobs []AppBlobsAvailable } + +// CachedIP : cached IP with time-limited validity. +type CachedIP struct { + IPAddress net.IP + ValidUntil time.Time +} + +// String representation of CachedIP. +func (c CachedIP) String() string { + return fmt.Sprintf("IP %s valid until %v", c.IPAddress, c.ValidUntil) +} + +// CachedResolvedIPs serves as a cache for storing the IP addresses obtained through +// DNS resolution for a given hostname. +type CachedResolvedIPs struct { + Hostname string + CachedIPs []CachedIP +} + +// String representation of CachedResolvedIPs. 
+func (c CachedResolvedIPs) String() string { + var cachedIPs []string + for _, ip := range c.CachedIPs { + cachedIPs = append(cachedIPs, ip.String()) + } + return fmt.Sprintf("Hostname %s with cached resolved IPs: [%s]", c.Hostname, + strings.Join(cachedIPs, ", ")) +} + +// Key is used for pubsub +func (c CachedResolvedIPs) Key() string { + return c.Hostname +} + +// LogCreate : +func (c CachedResolvedIPs) LogCreate(logBase *base.LogObject) { + logObject := base.NewLogObject(logBase, base.CachedResolvedIPsLogType, "", + nilUUID, c.LogKey()) + logObject.Metricf("CachedResolvedIPs create %s", c.String()) +} + +// LogModify : +func (c CachedResolvedIPs) LogModify(logBase *base.LogObject, old interface{}) { + logObject := base.EnsureLogObject(logBase, base.CachedResolvedIPsLogType, "", + nilUUID, c.LogKey()) + oldVal, ok := old.(CachedResolvedIPs) + if !ok { + logObject.Clone().Fatalf( + "LogModify: Old object interface passed is not of CachedResolvedIPs type") + } + logObject.Metricf("CachedResolvedIPs modified from %s to %s", + oldVal.String(), c.String()) +} + +// LogDelete : +func (c CachedResolvedIPs) LogDelete(logBase *base.LogObject) { + logObject := base.EnsureLogObject(logBase, base.CachedResolvedIPsLogType, "", + nilUUID, c.LogKey()) + logObject.Metricf("CachedResolvedIPs delete %s", c.String()) + base.DeleteLogObject(logBase, c.LogKey()) +} + +// LogKey : +func (c CachedResolvedIPs) LogKey() string { + return string(base.CachedResolvedIPsLogType) + "-" + c.Key() +} diff --git a/pkg/pillar/utils/ip.go b/pkg/pillar/utils/ip.go index 43c735afec..1e93d73894 100644 --- a/pkg/pillar/utils/ip.go +++ b/pkg/pillar/utils/ip.go @@ -27,3 +27,10 @@ func EqualIPNets(ipNet1, ipNet2 *net.IPNet) bool { return ipNet1.IP.Equal(ipNet2.IP) && bytes.Equal(ipNet1.Mask, ipNet2.Mask) } + +// SameIPVersions returns true if both IP addresses are of the same version +func SameIPVersions(ip1, ip2 net.IP) bool { + firstIsV4 := ip1.To4() != nil + secondIsV4 := ip2.To4() != nil + return 
firstIsV4 == secondIsV4 +} diff --git a/pkg/pillar/zedcloud/send.go b/pkg/pillar/zedcloud/send.go index 9a02fb9acb..9c4ade7b82 100644 --- a/pkg/pillar/zedcloud/send.go +++ b/pkg/pillar/zedcloud/send.go @@ -50,6 +50,7 @@ type ZedCloudContext struct { TlsConfig *tls.Config FailureFunc func(log *base.LogObject, intf string, url string, reqLen int64, respLen int64, authFail bool) SuccessFunc func(log *base.LogObject, intf string, url string, reqLen int64, respLen int64, timeSpent int64, resume bool) + ResolverCacheFunc ResolverCacheFunc NoLedManager bool // Don't call UpdateLedManagerConfig DevUUID uuid.UUID DevSerial string @@ -82,17 +83,23 @@ type ZedCloudContext struct { // ContextOptions - options to be passed at NewContext type ContextOptions struct { - DevNetworkStatus *types.DeviceNetworkStatus - TLSConfig *tls.Config - AgentMetrics *AgentMetrics - SendTimeout uint32 - DialTimeout uint32 - Serial string - SoftSerial string - AgentName string // XXX replace by NoLogFailures? - NetTraceOpts []nettrace.TraceOpt + DevNetworkStatus *types.DeviceNetworkStatus + TLSConfig *tls.Config + AgentMetrics *AgentMetrics + SendTimeout uint32 + DialTimeout uint32 + Serial string + SoftSerial string + AgentName string // XXX replace by NoLogFailures? + NetTraceOpts []nettrace.TraceOpt + ResolverCacheFunc ResolverCacheFunc } +// ResolverCacheFunc is a callback that the caller may provide to give access +// to cached resolved IP addresses. SendOnIntf will try to use the cached IPs +// to avoid unnecessary DNS lookups. +type ResolverCacheFunc func(hostname string) []types.CachedIP + // SendAttempt - single attempt to send data made by SendOnIntf function. type SendAttempt struct { // Non-nil if the attempt failed. 
@@ -253,8 +260,8 @@ func SendOnAllIntf(ctxWork context.Context, ctx *ZedCloudContext, url string, re combinedRV.RespContents = rv.RespContents return combinedRV, nil } - errStr := fmt.Sprintf("All attempts to connect to %s failed: %v", - url, attempts) + errStr := fmt.Sprintf("All attempts to connect to %s failed: %s", + url, describeSendAttempts(attempts)) log.Errorln(errStr) err := &SendError{ Err: errors.New(errStr), @@ -431,8 +438,8 @@ func VerifyAllIntf(ctx *ZedCloudContext, url string, requiredSuccessCount uint, return verifyRV, err } if intfSuccessCount == 0 { - errStr := fmt.Sprintf("All attempts to connect to %s failed: %v", - url, attempts) + errStr := fmt.Sprintf("All attempts to connect to %s failed: %s", + url, describeSendAttempts(attempts)) log.Errorln(errStr) err := &SendError{ Err: errors.New(errStr), @@ -457,6 +464,146 @@ func VerifyAllIntf(ctx *ZedCloudContext, url string, requiredSuccessCount uint, return verifyRV, nil } +// resolverWithLocalIP extends net.Resolver to allow to define local IP for DNS queries +// and a callback to skip some DNS servers. The callback is used by SendOnIntf to filter +// out DNS servers which should not be used for the given interface. +type resolverWithLocalIP struct { + log *base.LogObject + ifName string + localIP net.IP + skipNs nettrace.NameserverSelector + // Output flags used by dialerWithResolverCache to determine appropriate error + // for failed Dial. + dialRequested bool + dnsWasAvail bool +} + +func (r *resolverWithLocalIP) resolverDial( + ctx context.Context, network, address string) (net.Conn, error) { + if r.log != nil { + r.log.Tracef("resolverDial %v %v", network, address) + } + r.dialRequested = true + dnsHost, _, err := net.SplitHostPort(address) + if err != nil { + // No port in the address. 
+ dnsHost = address + } + dnsIP := net.ParseIP(dnsHost) + if dnsIP == nil { + return nil, fmt.Errorf("failed to parse DNS IP address '%s'", dnsHost) + } + if dnsIP.IsLoopback() { + // 127.0.0.1:53 is tried by Golang resolver when resolv.conf does not contain + // any nameservers (see defaultNS in net/dnsconfig_unix.go). + // There is no point in looking for DNS server on the loopback interface on EVE. + return nil, &types.DNSNotAvail{IfName: r.ifName} + } + // Note that port number is not looked at by skipNs. + if r.skipNs != nil { + if skip, reason := r.skipNs(dnsIP, 0); skip { + return nil, fmt.Errorf("skipped nameserver %v: %s", dnsIP, reason) + } + } + r.dnsWasAvail = true + switch network { + case "udp", "udp4", "udp6": + d := net.Dialer{LocalAddr: &net.UDPAddr{IP: r.localIP}} + return d.Dial(network, address) + case "tcp", "tcp4", "tcp6": + d := net.Dialer{LocalAddr: &net.TCPAddr{IP: r.localIP}} + return d.Dial(network, address) + default: + return nil, fmt.Errorf("unsupported address type: %v", network) + } +} + +// Return resolverWithLocalIP functionality wrapped inside the standard net.Resolver type. +func (r *resolverWithLocalIP) getNetResolver() *net.Resolver { + return &net.Resolver{Dial: r.resolverDial, PreferGo: true, StrictErrors: false} +} + +// dialerWithResolverCache provides DialContext function just like regular net.Dialer. +// The difference is that it will try to avoid DNS query if the target hostname IP is already +// resolved and stored in the cache. +// If dialing the cached IP fails, dialer will fall back to using regular dial, performing +// hostname IP resolution using available DNS servers. +type dialerWithResolverCache struct { + log *base.LogObject + ifName string + localIP net.IP + skipNs nettrace.NameserverSelector + timeout time.Duration + resolverCache ResolverCacheFunc +} + +// DialContext : extends net.DialContext to first try dialing using a cached IP if available. +// Only if that fails, the standard DialContext is called. 
+func (d *dialerWithResolverCache) DialContext( + ctx context.Context, network, address string) (net.Conn, error) { + if d.log != nil { + d.log.Tracef("DialContext %v %v", network, address) + } + resolver := resolverWithLocalIP{ + log: d.log, + ifName: d.ifName, + localIP: d.localIP, + skipNs: d.skipNs, + } + stdDialer := net.Dialer{ + Resolver: resolver.getNetResolver(), + LocalAddr: &net.TCPAddr{IP: d.localIP}, + Timeout: d.timeout, + } + host, port, err := net.SplitHostPort(address) + if err != nil { + host = address + port = "" + } + var cachedLookup []types.CachedIP + if d.resolverCache != nil { + cachedLookup = d.resolverCache(host) + } + for _, cachedEntry := range cachedLookup { + if time.Now().After(cachedEntry.ValidUntil) { + continue + } + if d.localIP != nil && + !utils.SameIPVersions(cachedEntry.IPAddress, d.localIP) { + continue + } + var addrWithIP string + if port == "" { + addrWithIP = cachedEntry.IPAddress.String() + } else { + addrWithIP = net.JoinHostPort(cachedEntry.IPAddress.String(), port) + } + conn, err := stdDialer.DialContext(ctx, network, addrWithIP) + if err == nil { + return conn, nil + } + } + // Fall back to using the regular dialer. + conn, err := stdDialer.DialContext(ctx, network, address) + if err != nil { + // Find out if dial failed because there was no DNS server available. + // Even though SendOnIntf checks if there are any DNS servers available + // for the given interface in DeviceNetworkStatus before using this dialer, + // there might be a delay between config being written to /etc/resolv.conf + // and the Golang resolver reloading it. More info about this can be found + // in pillar/dpcmanager/verify.go, function verifyDPC. + // Note that even with empty resolv.conf, Golang resolver will try at least + // 127.0.0.1:53, so dialRequested=true means that hostname IP resolution was + // needed (not using cached IP or /etc/hosts). 
+ // dnsWasAvail is set after filtering out DNS servers which are not valid + // for the given interface (servers from other interfaces and the loopback IP). + if resolver.dialRequested && !resolver.dnsWasAvail { + err = &types.DNSNotAvail{IfName: d.ifName} + } + } + return conn, err +} + // SendOnIntf : Tries all source addresses on interface until one succeeds. // Returns response for first success. Caller can not use SendRetval.HTTPResp.Body but can // use SendRetval.RespContents contents return. @@ -717,12 +864,11 @@ func SendOnIntf(workContext context.Context, ctx *ZedCloudContext, destURL strin tracedClient *nettrace.HTTPClient tracedReqName string tracedReqDescr string - dnsIsAvail bool - // Did the domain name resolution used IP address cached in /etc/hosts - // (see pillar/cmd/nim/controllerdns.go) - fromDNSCache bool ) if withNetTracing { + // Note that resolver cache is not supported when network tracing is enabled. + // This is actually intentional - when tracing, we want to run normal hostname + // IP resolution and collect traces of DNS queries. tracedClient, err = nettrace.NewHTTPClient(clientConfig, ctx.NetTraceOpts...) if err != nil { log.Errorf("SendOnIntf: nettrace.NewHTTPClient failed: %v\n", err) @@ -741,32 +887,15 @@ func SendOnIntf(workContext context.Context, ctx *ZedCloudContext, destURL strin tracedReqDescr = fmt.Sprintf("%s %s via %s src IP %v", reqMethod, reqURL, intf, localAddr) } else { - fromDNSCache = true // set to false by resolverDial below - localTCPAddr := net.TCPAddr{IP: localAddr} - localUDPAddr := net.UDPAddr{IP: localAddr} - log.Tracef("Connecting to %s using intf %s source %v\n", - reqURL, intf, localTCPAddr) - resolverDial := func(ctx context.Context, network, address string) (net.Conn, error) { - log.Tracef("resolverDial %v %v", network, address) - fromDNSCache = false - dnsIP := net.ParseIP(strings.Split(address, ":")[0]) - // Note that port number is not looked at by skipNs. 
- skip, reason := skipNs(dnsIP, 0) - if skip { - return nil, fmt.Errorf("skipped nameserver %v: %s", dnsIP, reason) - } - dnsIsAvail = true - // XXX can we fallback to TCP? Would get a mismatched address if we do - d := net.Dialer{LocalAddr: &localUDPAddr} - return d.Dial(network, address) - } - r := net.Resolver{Dial: resolverDial, PreferGo: true, StrictErrors: false} - d := net.Dialer{ - Resolver: &r, - LocalAddr: &localTCPAddr, - Timeout: clientConfig.TCPHandshakeTimeout, + dialer := &dialerWithResolverCache{ + log: log, + ifName: intf, + localIP: localAddr, + skipNs: skipNs, + timeout: clientConfig.TCPHandshakeTimeout, + resolverCache: ctx.ResolverCacheFunc, } - transport.DialContext = d.DialContext + transport.DialContext = dialer.DialContext client = &http.Client{Transport: transport, Timeout: clientConfig.ReqTimeout} } @@ -779,34 +908,35 @@ func SendOnIntf(workContext context.Context, ctx *ZedCloudContext, destURL strin if withPCAP { time.Sleep(pcapDelay) } - netTrace, pcaps, err := tracedClient.GetTrace(tracedReqDescr) - if err != nil { - log.Error(err) + netTrace, pcaps, err2 := tracedClient.GetTrace(tracedReqDescr) + if err2 != nil { + log.Error(err2) } else { rv.TracedReqs = append(rv.TracedReqs, netdump.TracedNetRequest{ RequestName: tracedReqName, NetTrace: netTrace, PacketCaptures: pcaps, }) - // Determine dnsIsAvail and fromDNSCache using Dial traces. - fromDNSCache = true + // Find out if dial failed because there was no DNS server available. 
+ var calledResolver, dnsWasAvail bool for _, dialTrace := range netTrace.Dials { if len(dialTrace.ResolverDials) > 0 { - dnsIsAvail = true - fromDNSCache = false + dnsWasAvail = true + calledResolver = true } if len(dialTrace.SkippedNameservers) > 0 { - fromDNSCache = false + calledResolver = true } } + if calledResolver && !dnsWasAvail { + err = &types.DNSNotAvail{IfName: intf} + } } - if err = tracedClient.Close(); err != nil { - log.Error(err) + if err2 = tracedClient.Close(); err2 != nil { + log.Error(err2) } } - if !fromDNSCache && !dnsIsAvail { - attempt.Err = &types.DNSNotAvail{IfName: intf} - } else if cf, cert := isCertFailure(err); cf { + if cf, cert := isCertFailure(err); cf { // XXX can we ever get this from a proxy? // We assume we reached the controller here log.Errorf("client.Do fail: certFailure") @@ -889,8 +1019,8 @@ func SendOnIntf(workContext context.Context, ctx *ZedCloudContext, destURL strin // Handle failure to read HTTP response body. if readErr != nil { log.Errorf("ReadAll (timeout %d) failed: %s", - ctx.NetworkSendTimeout, err) - attempt.Err = err + ctx.NetworkSendTimeout, readErr) + attempt.Err = readErr attempts = append(attempts, attempt) continue } @@ -987,8 +1117,8 @@ func SendOnIntf(workContext context.Context, ctx *ZedCloudContext, destURL strin if ctx.FailureFunc != nil { ctx.FailureFunc(log, intf, reqURL, 0, 0, false) } - errStr := fmt.Sprintf("All attempts to connect to %s failed: %v", - reqURL, attempts) + errStr := fmt.Sprintf("All attempts to connect to %s failed: %s", + reqURL, describeSendAttempts(attempts)) log.Errorln(errStr) err = &SendError{ Err: errors.New(errStr), @@ -1025,31 +1155,14 @@ func SendLocal(ctx *ZedCloudContext, destURL string, intf string, ipSrc net.IP, // Since we recreate the transport on each call there is no benefit // to keeping the connections open. 
defer transport.CloseIdleConnections() - - // Try all addresses - localTCPAddr := net.TCPAddr{IP: ipSrc} - localUDPAddr := net.UDPAddr{IP: ipSrc} - resolverDial := func(ctx context.Context, network, address string) (net.Conn, error) { - log.Tracef("resolverDial %v %v", network, address) - switch network { - case "udp", "udp4", "udp6": - d := net.Dialer{LocalAddr: &localUDPAddr} - return d.Dial(network, address) - case "tcp", "tcp4", "tcp6": - d := net.Dialer{LocalAddr: &localTCPAddr} - return d.Dial(network, address) - default: - return nil, fmt.Errorf("unsupported address type: %v", network) - } + dialer := &dialerWithResolverCache{ + log: log, + ifName: intf, + localIP: ipSrc, + timeout: time.Duration(ctx.NetworkDialTimeout) * time.Second, + resolverCache: ctx.ResolverCacheFunc, } - r := net.Resolver{Dial: resolverDial, PreferGo: true, - StrictErrors: false} - d := net.Dialer{ - Resolver: &r, - LocalAddr: &localTCPAddr, - Timeout: time.Duration(ctx.NetworkDialTimeout) * time.Second, - } - transport.Dial = d.Dial + transport.DialContext = dialer.DialContext client := &http.Client{Transport: transport} if ctx.NetworkSendTimeout != 0 { @@ -1240,12 +1353,37 @@ func isECONNREFUSED(err error) bool { return errno == syscall.ECONNREFUSED } +// Describe send attempts in a concise and readable form. +func describeSendAttempts(attempts []SendAttempt) string { + var attemptDescriptions []string + for _, attempt := range attempts { + var description string + // Unwrap errors defined here in pillar to avoid stutter. + // Instead of "send via eth1: interface eth1: no DNS server available", + // we simply return "interface eth1: no DNS server available". + // Same for IPAddrNotAvail. 
+ // Otherwise, the errors are of the form: + // "send via eth1 [with src IP ]: " + switch err := attempt.Err.(type) { + case *types.DNSNotAvail: + description = err.Error() + case *types.IPAddrNotAvail: + description = err.Error() + default: + description = attempt.String() + } + attemptDescriptions = append(attemptDescriptions, description) + } + return strings.Join(attemptDescriptions, "; ") +} + // NewContext - return initialized cloud context func NewContext(log *base.LogObject, opt ContextOptions) ZedCloudContext { ctx := ZedCloudContext{ DeviceNetworkStatus: opt.DevNetworkStatus, NetworkSendTimeout: opt.SendTimeout, NetworkDialTimeout: opt.DialTimeout, + ResolverCacheFunc: opt.ResolverCacheFunc, TlsConfig: opt.TLSConfig, V2API: UseV2API(), DevSerial: opt.Serial, diff --git a/tools/makeflash.sh b/tools/makeflash.sh index 179d7ba7c7..61c033c31c 100755 --- a/tools/makeflash.sh +++ b/tools/makeflash.sh @@ -1,11 +1,12 @@ #!/bin/sh # Usage: # -# ./makeflash.sh [-C size] [partitions] +# ./makeflash.sh [-C size] [partitions] # EVE="$(cd "$(dirname "$0")" && pwd)/../" PATH="$EVE/build-tools/bin:$PATH" -MKFLASH_TAG="$(linuxkit pkg show-tag "$EVE/pkg/mkimage-raw-efi")" +MKFLASH_TAG="$(linuxkit pkg show-tag "$EVE/pkg/$1")" +shift 1 if [ "$1" = "-C" ]; then SIZE="$2" diff --git a/tools/makeverification.sh b/tools/makeverification.sh deleted file mode 100755 index f8307fe70f..0000000000 --- a/tools/makeverification.sh +++ /dev/null @@ -1,22 +0,0 @@ -#!/bin/sh -# Usage: -# -# ./makeflash.sh [-C size] [partitions] -# -EVE="$(cd "$(dirname "$0")" && pwd)/../" -PATH="$EVE/build-tools/bin:$PATH" -MKFLASH_TAG="$(linuxkit pkg show-tag "$EVE/pkg/mkverification-raw-efi")" - -if [ "$1" = "-C" ]; then - SIZE="$2" - dd if=/dev/zero of="$4" seek=$(( SIZE * 1024 * 1024 - 1)) bs=1 count=1 - # If we're a non-root user, the bind mount gets permissions sensitive. 
- # So we go docker^Wcowboy style - chmod ugo+w "$4" - shift 2 -fi - -SOURCE="$(cd "$1" && pwd)" -IMAGE="$(cd "$(dirname "$2")" && pwd)/$(basename "$2")" -shift 2 -docker run --rm -v "$SOURCE:/parts" -v "$IMAGE:/output.img" "$MKFLASH_TAG" /output.img "$@"