From 05033bc8caee6247fb35c29d3e52ea0523bf9c12 Mon Sep 17 00:00:00 2001 From: Humble Devassy Chirammal Date: Wed, 10 Jun 2026 14:04:57 +0530 Subject: [PATCH 1/2] Update node-problem-detector to v1.35.2 and remove addon manifests Update node-problem-detector from v1.34.0 to v1.35.2 and remove all related addon manifests and install logic that is no longer needed: - Update version in build/dependencies.yaml, test/e2e_node/image_list.go and test/kubemark/resources/hollow-node_template.yaml. - Remove cluster/addons/node-problem-detector/ entirely. No e2e tests depend on these manifests: e2e_node tests create NPD pods inline and GCE standalone mode runs NPD as a systemd service. - Remove install-node-problem-detector function and DEFAULT_NPD_* vars from cluster/gce/gci/configure.sh along with the conditional that invoked it, since NPD is no longer installed as a standalone binary via this script. - Remove the setup-addon-manifests calls for node-problem-detector from cluster/gce/gci/configure-helper.sh since the source directory no longer exists. - Remove stale refPaths in build/dependencies.yaml that pointed to the deleted addon files. 
Signed-off-by: Humble Devassy Chirammal --- build/dependencies.yaml | 13 +-- .../node-problem-detector/MAINTAINERS.md | 4 - cluster/addons/node-problem-detector/OWNERS | 8 -- .../addons/node-problem-detector/README.md | 7 -- .../kubelet-user-standalone/npd-binding.yaml | 15 --- cluster/addons/node-problem-detector/npd.yaml | 96 ------------------- .../standalone/npd-binding.yaml | 15 --- cluster/gce/gci/configure-helper.sh | 10 -- cluster/gce/gci/configure.sh | 59 +----------- test/e2e_node/image_list.go | 2 +- .../resources/hollow-node_template.yaml | 2 +- 11 files changed, 4 insertions(+), 227 deletions(-) delete mode 100644 cluster/addons/node-problem-detector/MAINTAINERS.md delete mode 100644 cluster/addons/node-problem-detector/OWNERS delete mode 100644 cluster/addons/node-problem-detector/README.md delete mode 100644 cluster/addons/node-problem-detector/kubelet-user-standalone/npd-binding.yaml delete mode 100644 cluster/addons/node-problem-detector/npd.yaml delete mode 100644 cluster/addons/node-problem-detector/standalone/npd-binding.yaml diff --git a/build/dependencies.yaml b/build/dependencies.yaml index bdb87d7b62d..de5da0a4e9d 100644 --- a/build/dependencies.yaml +++ b/build/dependencies.yaml @@ -109,23 +109,12 @@ dependencies: match: registry.k8s.io/e2e-test-images/agnhost:\d+\.\d+\.\d+ - name: "node-problem-detector" - version: 1.34.0 + version: 1.35.2 refPaths: - path: test/e2e_node/image_list.go match: const defaultImage - path: test/kubemark/resources/hollow-node_template.yaml match: registry.k8s.io/node-problem-detector/node-problem-detector - - path: cluster/addons/node-problem-detector/npd.yaml - match: registry.k8s.io/node-problem-detector/node-problem-detector - - path: cluster/addons/node-problem-detector/npd.yaml - match: app.kubernetes.io/version - # TODO(dims): Ensure newer versions get uploaded to - # - https://console.cloud.google.com/storage/browser/gke-release/winnode/node-problem-detector - # - 
https://gcsweb.k8s.io/gcs/kubernetes-release/node-problem-detector/ - # and then the following references get fixed. - # - - path: cluster/gce/gci/configure.sh - match: DEFAULT_NPD_VERSION= #- path: cluster/gce/windows/k8s-node-setup.psm1 # match: DEFAULT_NPD_VERSION diff --git a/cluster/addons/node-problem-detector/MAINTAINERS.md b/cluster/addons/node-problem-detector/MAINTAINERS.md deleted file mode 100644 index d2fd3b65127..00000000000 --- a/cluster/addons/node-problem-detector/MAINTAINERS.md +++ /dev/null @@ -1,4 +0,0 @@ -# Maintainers - -Random-Liu -wangzhen127 diff --git a/cluster/addons/node-problem-detector/OWNERS b/cluster/addons/node-problem-detector/OWNERS deleted file mode 100644 index 66cec0a3619..00000000000 --- a/cluster/addons/node-problem-detector/OWNERS +++ /dev/null @@ -1,8 +0,0 @@ -# See the OWNERS docs at https://go.k8s.io/owners - -approvers: - - Random-Liu - - wangzhen127 -reviewers: - - Random-Liu - - wangzhen127 diff --git a/cluster/addons/node-problem-detector/README.md b/cluster/addons/node-problem-detector/README.md deleted file mode 100644 index 220ab1315d0..00000000000 --- a/cluster/addons/node-problem-detector/README.md +++ /dev/null @@ -1,7 +0,0 @@ -# Node Problem Detector -============== - -Node Problem Detector is a DaemonSet running on each node, detecting node -problems. 
- -Learn more at: https://github.com/kubernetes/node-problem-detector diff --git a/cluster/addons/node-problem-detector/kubelet-user-standalone/npd-binding.yaml b/cluster/addons/node-problem-detector/kubelet-user-standalone/npd-binding.yaml deleted file mode 100644 index 3d34fef427c..00000000000 --- a/cluster/addons/node-problem-detector/kubelet-user-standalone/npd-binding.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: kubelet-user-npd-binding - labels: - kubernetes.io/cluster-service: "true" - addonmanager.kubernetes.io/mode: Reconcile -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: system:node-problem-detector -subjects: -- apiGroup: rbac.authorization.k8s.io - kind: User - name: kubelet diff --git a/cluster/addons/node-problem-detector/npd.yaml b/cluster/addons/node-problem-detector/npd.yaml deleted file mode 100644 index 838106be1bf..00000000000 --- a/cluster/addons/node-problem-detector/npd.yaml +++ /dev/null @@ -1,96 +0,0 @@ -apiVersion: v1 -kind: ServiceAccount -metadata: - name: node-problem-detector - namespace: kube-system - labels: - kubernetes.io/cluster-service: "true" - addonmanager.kubernetes.io/mode: Reconcile ---- -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: npd-binding - labels: - kubernetes.io/cluster-service: "true" - addonmanager.kubernetes.io/mode: Reconcile -roleRef: - apiGroup: rbac.authorization.k8s.io - kind: ClusterRole - name: system:node-problem-detector -subjects: -- kind: ServiceAccount - name: node-problem-detector - namespace: kube-system ---- -apiVersion: apps/v1 -kind: DaemonSet -metadata: - name: node-problem-detector - namespace: kube-system - labels: - app.kubernetes.io/name: node-problem-detector - app.kubernetes.io/version: v1.34.0 - addonmanager.kubernetes.io/mode: Reconcile -spec: - selector: - matchLabels: - app.kubernetes.io/name: node-problem-detector - 
app.kubernetes.io/version: v1.34.0 - template: - metadata: - labels: - app.kubernetes.io/name: node-problem-detector - app.kubernetes.io/version: v1.34.0 - spec: - nodeSelectors: - kubernetes.io/os: linux - containers: - - name: node-problem-detector - image: registry.k8s.io/node-problem-detector/node-problem-detector:v1.34.0 - command: - - "/bin/sh" - - "-c" - - "exec /node-problem-detector --logtostderr --config.system-log-monitor=/config/kernel-monitor.json,/config/systemd-monitor.json --config.custom-plugin-monitor=/config/kernel-monitor-counter.json,/config/systemd-monitor-counter.json --config.system-stats-monitor=/config/system-stats-monitor.json >>/var/log/node-problem-detector.log 2>&1" - securityContext: - privileged: true - resources: - limits: - cpu: "200m" - memory: "100Mi" - requests: - cpu: "20m" - memory: "20Mi" - env: - - name: NODE_NAME - valueFrom: - fieldRef: - fieldPath: spec.nodeName - volumeMounts: - - name: log - mountPath: /var/log - - name: kmsg - mountPath: /dev/kmsg - readOnly: true - - name: localtime - mountPath: /etc/localtime - readOnly: true - volumes: - - name: log - hostPath: - path: /var/log/ - - name: kmsg - hostPath: - path: /dev/kmsg - - name: localtime - hostPath: - path: /etc/localtime - type: "FileOrCreate" - serviceAccountName: node-problem-detector - tolerations: - - operator: "Exists" - effect: "NoExecute" - - operator: "Exists" - effect: "NoSchedule" - - key: "CriticalAddonsOnly" - operator: "Exists" diff --git a/cluster/addons/node-problem-detector/standalone/npd-binding.yaml b/cluster/addons/node-problem-detector/standalone/npd-binding.yaml deleted file mode 100644 index d7d64a63684..00000000000 --- a/cluster/addons/node-problem-detector/standalone/npd-binding.yaml +++ /dev/null @@ -1,15 +0,0 @@ -apiVersion: rbac.authorization.k8s.io/v1 -kind: ClusterRoleBinding -metadata: - name: npd-binding - labels: - kubernetes.io/cluster-service: "true" - addonmanager.kubernetes.io/mode: Reconcile -roleRef: - apiGroup: 
rbac.authorization.k8s.io - kind: ClusterRole - name: system:node-problem-detector -subjects: -- apiGroup: rbac.authorization.k8s.io - kind: User - name: system:node-problem-detector diff --git a/cluster/gce/gci/configure-helper.sh b/cluster/gce/gci/configure-helper.sh index 505ef63ad34..14552a7ef65 100755 --- a/cluster/gce/gci/configure-helper.sh +++ b/cluster/gce/gci/configure-helper.sh @@ -2966,16 +2966,6 @@ function start-kube-addons { update-event-exporter ${event_exporter_yaml} update-prometheus-to-sd-parameters ${event_exporter_yaml} fi - if [[ "${ENABLE_NODE_PROBLEM_DETECTOR:-}" == "daemonset" ]]; then - setup-addon-manifests "addons" "node-problem-detector" - fi - if [[ "${ENABLE_NODE_PROBLEM_DETECTOR:-}" == "standalone" ]]; then - # Setup role binding(s) for standalone node problem detector. - if [[ -n "${NODE_PROBLEM_DETECTOR_TOKEN:-}" ]]; then - setup-addon-manifests "addons" "node-problem-detector/standalone" - fi - setup-addon-manifests "addons" "node-problem-detector/kubelet-user-standalone" "node-problem-detector" - fi if echo "${ADMISSION_CONTROL:-}" | grep -q "LimitRanger"; then setup-addon-manifests "admission-controls" "limit-range" "gce" fi diff --git a/cluster/gce/gci/configure.sh b/cluster/gce/gci/configure.sh index b61f85cdf72..451f62e31b4 100644 --- a/cluster/gce/gci/configure.sh +++ b/cluster/gce/gci/configure.sh @@ -27,9 +27,7 @@ set -o pipefail DEFAULT_CNI_VERSION='v1.9.1' # CNI HASH for amd64 sha512 DEFAULT_CNI_HASH='3ea8a76852b7ddc62c087a34cccca2cb29822ca24214928cd172b28bf9d1486000ba3eb71a156445af31ff6a92c1dc3e01e702546c6ee016ef13fae06ccfb8fc' -DEFAULT_NPD_VERSION='v1.34.0' -DEFAULT_NPD_HASH_AMD64='3c55ff6ffadd77dbc3df3774d13164587103ca87c8b6914f5c71c87d8f498b78621e0c96538bb3c69f8f1b4194a6da553aa56b1b52001a7d9a67776ac24e80bd' -DEFAULT_NPD_HASH_ARM64='ca1d34e64b80f6b2bdf86cfde95154122d6e14c707a748ea6fc414a55f391b1bb572a96b6b2c285996af0232917fa87e14e037125aa03a62247383af3e48c095' + DEFAULT_CRICTL_VERSION='v1.36.0' 
DEFAULT_CRICTL_AMD64_SHA512='43ac5425d264547bc9d9c9e31c74624d9c2a63bf7de4e77fe79517e0c927ea77ee3951a2f662920bc771599a0dc4f2859b6225c3621c7cafff952e63c83d686d' DEFAULT_CRICTL_ARM64_SHA512='485aa86f327c23cb0508e814e568bda793d291865c5cec3337ae5467a51898e9ab21a6bd38b73a6b219058bb34c9b4e7128e57360a2552b74a552e7ea1936f32' @@ -293,56 +291,6 @@ function install-gci-mounter-tools { mkdir -p "${CONTAINERIZED_MOUNTER_HOME}/rootfs/var/lib/kubelet" } -# Install node problem detector binary. -function install-node-problem-detector { - if [[ -n "${NODE_PROBLEM_DETECTOR_VERSION:-}" ]]; then - local -r npd_version="${NODE_PROBLEM_DETECTOR_VERSION}" - local -r npd_hash="${NODE_PROBLEM_DETECTOR_TAR_HASH}" - else - local -r npd_version="${DEFAULT_NPD_VERSION}" - case "${HOST_PLATFORM}/${HOST_ARCH}" in - linux/amd64) - local -r npd_hash="${DEFAULT_NPD_HASH_AMD64}" - ;; - linux/arm64) - local -r npd_hash="${DEFAULT_NPD_HASH_ARM64}" - ;; - # no other architectures are supported currently. - # Assumption is that this script only runs on linux, - # see cluster/gce/windows/k8s-node-setup.psm1 for windows - # https://github.com/kubernetes/node-problem-detector/releases/ - *) - echo "Unrecognized version and platform/arch combination:" - echo "$DEFAULT_NPD_VERSION $HOST_PLATFORM/$HOST_ARCH" - echo "Set NODE_PROBLEM_DETECTOR_VERSION and NODE_PROBLEM_DETECTOR_TAR_HASH to overwrite" - exit 1 - ;; - esac - fi - local -r npd_tar="node-problem-detector-${npd_version}-${HOST_PLATFORM}_${HOST_ARCH}.tar.gz" - - if is-preloaded "${npd_tar}" "${npd_hash}"; then - echo "${npd_tar} is preloaded." - return - fi - - if [[ -n "${NODE_PROBLEM_DETECTOR_RELEASE_PATH:-}" ]]; then - echo "Downloading ${npd_tar} from ${NODE_PROBLEM_DETECTOR_RELEASE_PATH}." - local -r download_path="${NODE_PROBLEM_DETECTOR_RELEASE_PATH}/node-problem-detector/${npd_tar}" - else - echo "Downloading ${npd_tar} from github." 
- local -r download_path="https://github.com/kubernetes/node-problem-detector/releases/download/${npd_version}/${npd_tar}" - fi - download-or-bust "${npd_hash}" "${download_path}" - local -r npd_dir="${KUBE_HOME}/node-problem-detector" - mkdir -p "${npd_dir}" - tar xzf "${KUBE_HOME}/${npd_tar}" -C "${npd_dir}" --overwrite - mv "${npd_dir}/bin"/* "${KUBE_BIN}" - chmod a+x "${KUBE_BIN}/node-problem-detector" - rmdir "${npd_dir}/bin" - rm -f "${KUBE_HOME}/${npd_tar}" -} - function install-cni-binaries { local -r cni_version=${CNI_VERSION:-$DEFAULT_CNI_VERSION} if [[ -n "${CNI_VERSION:-}" ]]; then @@ -795,11 +743,6 @@ function install-kube-binary-config { mv "${KUBE_HOME}/kubernetes/kubernetes-src.tar.gz" "${KUBE_HOME}" fi - if [[ "${KUBERNETES_MASTER:-}" == "false" ]] && \ - [[ "${ENABLE_NODE_PROBLEM_DETECTOR:-}" == "standalone" ]]; then - log-wrap "InstallNodeProblemDetector" install-node-problem-detector - fi - if [[ "${NETWORK_PROVIDER:-}" == "kubenet" ]] || \ [[ "${NETWORK_PROVIDER:-}" == "cni" ]]; then log-wrap "InstallCNIBinaries" install-cni-binaries diff --git a/test/e2e_node/image_list.go b/test/e2e_node/image_list.go index 91f13a18c5f..cc469df0046 100644 --- a/test/e2e_node/image_list.go +++ b/test/e2e_node/image_list.go @@ -92,7 +92,7 @@ func updateImageAllowList(ctx context.Context) { } func getNodeProblemDetectorImage() string { - const defaultImage string = "registry.k8s.io/node-problem-detector/node-problem-detector:v1.34.0" + const defaultImage string = "registry.k8s.io/node-problem-detector/node-problem-detector:v1.35.2" image := os.Getenv("NODE_PROBLEM_DETECTOR_IMAGE") if image == "" { image = defaultImage diff --git a/test/kubemark/resources/hollow-node_template.yaml b/test/kubemark/resources/hollow-node_template.yaml index 1a51acb7a61..a6cc19c889d 100644 --- a/test/kubemark/resources/hollow-node_template.yaml +++ b/test/kubemark/resources/hollow-node_template.yaml @@ -99,7 +99,7 @@ spec: cpu: {{hollow_proxy_millicpu}}m memory: 
{{hollow_proxy_mem_Ki}}Ki - name: hollow-node-problem-detector - image: registry.k8s.io/node-problem-detector/node-problem-detector:v1.34.0 + image: registry.k8s.io/node-problem-detector/node-problem-detector:v1.35.2 env: - name: NODE_NAME valueFrom: From 5e7ff896192dc9a33f99e84f28c0671dd7bc74fe Mon Sep 17 00:00:00 2001 From: Humble Devassy Chirammal Date: Thu, 11 Jun 2026 11:42:01 +0530 Subject: [PATCH 2/2] cluster: remove remaining NPD envvars, setup functions, and Windows support Remove the remaining node-problem-detector references across the cluster/ directory that were not covered by the initial cleanup: - Remove ENABLE_NODE_PROBLEM_DETECTOR, NODE_PROBLEM_DETECTOR_VERSION, NODE_PROBLEM_DETECTOR_TAR_HASH, NODE_PROBLEM_DETECTOR_RELEASE_PATH and NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS exports from config-default.sh and config-test.sh. - Remove WINDOWS_ENABLE_NODE_PROBLEM_DETECTOR and WINDOWS_NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS from config-default.sh and config-test.sh. - Remove WINDOWS_NODEPROBLEMDETECTOR_KUBECONFIG_FILE from config-common.sh. - Remove create-node-problem-detector-kubeconfig and create-node-problem-detector-kubeconfig-from-kubelet functions along with all NPD token/kubeconfig setup blocks from configure-helper.sh. - Remove the start-node-problem-detector function and its call site from configure-helper.sh, and remove the NODE_PROBLEM_DETECTOR_TOKEN handling and the NPD node environment entries from upgrade.sh and util.sh. - Remove DownloadAndInstall-NodeProblemDetector, Create-NodeProblemDetectorKubeConfig and Configure-NodeProblemDetector from cluster/gce/windows/k8s-node-setup.psm1 and their call sites in configure.ps1. - Remove the node-problem-detector systemd source block from the fluentd configmaps and the node-problem-detector.log entry from log-dump.sh. - Remove npd and npdSA test variables and their use from audit_policy_test.go. 
Co-authored-by: Jordan Liggitt Signed-off-by: Humble Devassy Chirammal --- .../fluentd-gcp-configmap-old.yaml | 10 +- .../fluentd-gcp/fluentd-gcp-configmap.yaml | 10 +- cluster/gce/config-common.sh | 2 - cluster/gce/config-default.sh | 15 -- cluster/gce/config-test.sh | 15 -- cluster/gce/gci/audit_policy_test.go | 14 +- cluster/gce/gci/configure-helper.sh | 103 +----------- cluster/gce/upgrade.sh | 17 -- cluster/gce/util.sh | 17 -- cluster/gce/windows/configure.ps1 | 3 - cluster/gce/windows/k8s-node-setup.psm1 | 151 ------------------ cluster/log-dump/log-dump.sh | 4 +- 12 files changed, 11 insertions(+), 350 deletions(-) diff --git a/cluster/addons/fluentd-gcp/fluentd-gcp-configmap-old.yaml b/cluster/addons/fluentd-gcp/fluentd-gcp-configmap-old.yaml index bffda046ab6..611bd3ea8fc 100644 --- a/cluster/addons/fluentd-gcp/fluentd-gcp-configmap-old.yaml +++ b/cluster/addons/fluentd-gcp/fluentd-gcp-configmap-old.yaml @@ -267,14 +267,6 @@ data: tag kubelet - - @type systemd - filters [{ "_SYSTEMD_UNIT": "node-problem-detector.service" }] - pos_file /var/log/gcp-journald-node-problem-detector.pos - read_from_head true - tag node-problem-detector - - # BEGIN_NODE_JOURNAL # Whether to include node-journal or not is determined when starting the # cluster. It is not changed when the cluster is already running. 
@@ -289,7 +281,7 @@ data: @type grep key _SYSTEMD_UNIT - pattern ^(docker|{{ fluentd_container_runtime_service }}|kubelet|node-problem-detector)\.service$ + pattern ^(docker|{{ fluentd_container_runtime_service }}|kubelet)\.service$ # END_NODE_JOURNAL diff --git a/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml b/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml index 549fef75a54..44292f1d5c7 100644 --- a/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml +++ b/cluster/addons/fluentd-gcp/fluentd-gcp-configmap.yaml @@ -282,14 +282,6 @@ data: tag kubelet - - @type systemd - filters [{ "_SYSTEMD_UNIT": "node-problem-detector.service" }] - pos_file /var/log/gcp-journald-node-problem-detector.pos - read_from_head true - tag node-problem-detector - - # BEGIN_NODE_JOURNAL # Whether to include node-journal or not is determined when starting the # cluster. It is not changed when the cluster is already running. @@ -304,7 +296,7 @@ data: @type grep key _SYSTEMD_UNIT - pattern ^(docker|{{ fluentd_container_runtime_service }}|kubelet|node-problem-detector)\.service$ + pattern ^(docker|{{ fluentd_container_runtime_service }}|kubelet)\.service$ # END_NODE_JOURNAL diff --git a/cluster/gce/config-common.sh b/cluster/gce/config-common.sh index b1d715f3c26..c61a3f2fdc1 100644 --- a/cluster/gce/config-common.sh +++ b/cluster/gce/config-common.sh @@ -151,8 +151,6 @@ export WINDOWS_KUBECONFIG_FILE="${WINDOWS_K8S_DIR}\kubelet.kubeconfig" export WINDOWS_BOOTSTRAP_KUBECONFIG_FILE="${WINDOWS_K8S_DIR}\kubelet.bootstrap-kubeconfig" # Path for kube-proxy kubeconfig file on Windows nodes. export WINDOWS_KUBEPROXY_KUBECONFIG_FILE="${WINDOWS_K8S_DIR}\kubeproxy.kubeconfig" -# Path for kube-proxy kubeconfig file on Windows nodes. -export WINDOWS_NODEPROBLEMDETECTOR_KUBECONFIG_FILE="${WINDOWS_K8S_DIR}\node-problem-detector.kubeconfig" # Pause container image for Windows container. export WINDOWS_INFRA_CONTAINER="registry.k8s.io/pause:3.10.2" # Storage Path for csi-proxy. 
csi-proxy only needs to be installed for Windows. diff --git a/cluster/gce/config-default.sh b/cluster/gce/config-default.sh index c742e9b1671..3413397f7d4 100755 --- a/cluster/gce/config-default.sh +++ b/cluster/gce/config-default.sh @@ -289,16 +289,6 @@ export DNS_MEMORY_LIMIT="${KUBE_DNS_MEMORY_LIMIT:-170Mi}" # Optional: Enable DNS horizontal autoscaler export ENABLE_DNS_HORIZONTAL_AUTOSCALER="${KUBE_ENABLE_DNS_HORIZONTAL_AUTOSCALER:-true}" -# Optional: Install node problem detector. -# none - Not run node problem detector. -# daemonset - Run node problem detector as daemonset. -# standalone - Run node problem detector as standalone system daemon. -export ENABLE_NODE_PROBLEM_DETECTOR="${KUBE_ENABLE_NODE_PROBLEM_DETECTOR:-daemonset}" -NODE_PROBLEM_DETECTOR_VERSION="${NODE_PROBLEM_DETECTOR_VERSION:-}" -NODE_PROBLEM_DETECTOR_TAR_HASH="${NODE_PROBLEM_DETECTOR_TAR_HASH:-}" -NODE_PROBLEM_DETECTOR_RELEASE_PATH="${NODE_PROBLEM_DETECTOR_RELEASE_PATH:-}" -NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS="${NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS:-}" - CNI_HASH="${CNI_HASH:-}" CNI_TAR_PREFIX="${CNI_TAR_PREFIX:-cni-plugins-linux-amd64-}" CNI_STORAGE_URL_BASE="${CNI_STORAGE_URL_BASE:-https://github.com/containernetworking/plugins/releases/download}" @@ -526,11 +516,6 @@ export WINDOWS_ENABLE_PIGZ="${WINDOWS_ENABLE_PIGZ:-true}" # Enable Windows DSR (Direct Server Return) export WINDOWS_ENABLE_DSR="${WINDOWS_ENABLE_DSR:-false}" -# Install Node Problem Detector (NPD) on Windows nodes. -# NPD analyzes the host for problems that can disrupt workloads. 
-export WINDOWS_ENABLE_NODE_PROBLEM_DETECTOR="${WINDOWS_ENABLE_NODE_PROBLEM_DETECTOR:-none}" -export WINDOWS_NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS="${WINDOWS_NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS:-}" - # Enable Windows Hyper-V # sig-storage uses it to create Virtual Hard Disks in tests export WINDOWS_ENABLE_HYPERV="${WINDOWS_ENABLE_HYPERV:-false}" diff --git a/cluster/gce/config-test.sh b/cluster/gce/config-test.sh index f774f26a8fd..6ce5d6920c1 100755 --- a/cluster/gce/config-test.sh +++ b/cluster/gce/config-test.sh @@ -331,16 +331,6 @@ export DNS_MEMORY_LIMIT=${KUBE_DNS_MEMORY_LIMIT:-170Mi} # Optional: Enable DNS horizontal autoscaler export ENABLE_DNS_HORIZONTAL_AUTOSCALER=${KUBE_ENABLE_DNS_HORIZONTAL_AUTOSCALER:-true} -# Optional: Install node problem detector. -# none - Not run node problem detector. -# daemonset - Run node problem detector as daemonset. -# standalone - Run node problem detector as standalone system daemon. -export ENABLE_NODE_PROBLEM_DETECTOR=${KUBE_ENABLE_NODE_PROBLEM_DETECTOR:-daemonset} -NODE_PROBLEM_DETECTOR_VERSION=${NODE_PROBLEM_DETECTOR_VERSION:-} -NODE_PROBLEM_DETECTOR_TAR_HASH=${NODE_PROBLEM_DETECTOR_TAR_HASH:-} -NODE_PROBLEM_DETECTOR_RELEASE_PATH=${NODE_PROBLEM_DETECTOR_RELEASE_PATH:-} -NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS=${NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS:-} - CNI_HASH=${CNI_HASH:-} CNI_TAR_PREFIX=${CNI_TAR_PREFIX:-cni-plugins-linux-amd64-} CNI_STORAGE_URL_BASE=${CNI_STORAGE_URL_BASE:-https://github.com/containernetworking/plugins/releases/download} @@ -578,11 +568,6 @@ export WINDOWS_ENABLE_PIGZ="${WINDOWS_ENABLE_PIGZ:-true}" # Enable Windows DSR (Direct Server Return) export WINDOWS_ENABLE_DSR="${WINDOWS_ENABLE_DSR:-false}" -# Install Node Problem Detector (NPD) on Windows nodes. -# NPD analyzes the host for problems that can disrupt workloads. 
-export WINDOWS_ENABLE_NODE_PROBLEM_DETECTOR="${WINDOWS_ENABLE_NODE_PROBLEM_DETECTOR:-none}" -export WINDOWS_NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS="${WINDOWS_NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS:-}" - # TLS_CIPHER_SUITES defines cipher suites allowed to be used by kube-apiserver. # If this variable is unset or empty, kube-apiserver will allow its default set of cipher suites. export TLS_CIPHER_SUITES="" diff --git a/cluster/gce/gci/audit_policy_test.go b/cluster/gce/gci/audit_policy_test.go index bd2f528cc5a..06b895d988e 100644 --- a/cluster/gce/gci/audit_policy_test.go +++ b/cluster/gce/gci/audit_policy_test.go @@ -73,13 +73,11 @@ func TestCreateMasterAuditPolicy(t *testing.T) { scheduler = newUserInfo(user.KubeScheduler, user.AllAuthenticated) apiserver = newUserInfo(user.APIServerUser, user.SystemPrivilegedGroup) autoscaler = newUserInfo("cluster-autoscaler", user.AllAuthenticated) - npd = newUserInfo("system:node-problem-detector", user.AllAuthenticated) - npdSA = serviceaccount.UserInfo("kube-system", "node-problem-detector", "") namespaceController = serviceaccount.UserInfo("kube-system", "namespace-controller", "") endpointController = serviceaccount.UserInfo("kube-system", "endpoint-controller", "") defaultSA = serviceaccount.UserInfo("default", "default", "") - allUsers = []user.Info{anonymous, kubeproxy, ingress, kubelet, node, controller, scheduler, apiserver, autoscaler, npd, npdSA, namespaceController, endpointController, defaultSA} + allUsers = []user.Info{anonymous, kubeproxy, ingress, kubelet, node, controller, scheduler, apiserver, autoscaler, namespaceController, endpointController, defaultSA} ) // Resources for test cases @@ -149,15 +147,15 @@ func TestCreateMasterAuditPolicy(t *testing.T) { at.testResources(none, node, apiserver, defaultSA, anonymous, "get", "list", "create", "patch", "update", "delete", events) - at.testResources(request, kubelet, node, npd, npdSA, "update", "patch", nodeStatus, podStatus) + at.testResources(request, kubelet, node, 
"update", "patch", nodeStatus, podStatus) at.testResources(request, namespaceController, "deletecollection", pods, namespaces) - at.testResources(metadata, defaultSA, anonymous, npd, namespaceController, "get", "create", "update", secrets, configmaps, sysConfigmaps, tokenReviews) - at.testResources(request, defaultSA, anonymous, npd, namespaceController, "get", "list", "watch", sysEndpoints, podMetrics, pods, clusterRoles, deployments) - at.testResources(response, defaultSA, anonymous, npd, namespaceController, "create", "update", "patch", "delete", sysEndpoints, podMetrics, pods, clusterRoles, deployments) + at.testResources(metadata, defaultSA, anonymous, namespaceController, "get", "create", "update", secrets, configmaps, sysConfigmaps, tokenReviews) + at.testResources(request, defaultSA, anonymous, namespaceController, "get", "list", "watch", sysEndpoints, podMetrics, pods, clusterRoles, deployments) + at.testResources(response, defaultSA, anonymous, namespaceController, "create", "update", "patch", "delete", sysEndpoints, podMetrics, pods, clusterRoles, deployments) - at.testResources(metadata, defaultSA, anonymous, npd, namespaceController, "get", "list", "watch", "create", "update", "patch", "delete", foobars, foobarbaz) + at.testResources(metadata, defaultSA, anonymous, namespaceController, "get", "list", "watch", "create", "update", "patch", "delete", foobars, foobarbaz) } type auditTester struct { diff --git a/cluster/gce/gci/configure-helper.sh b/cluster/gce/gci/configure-helper.sh index 14552a7ef65..645b11264ec 100755 --- a/cluster/gce/gci/configure-helper.sh +++ b/cluster/gce/gci/configure-helper.sh @@ -845,9 +845,6 @@ function create-master-auth { if [[ -n "${KUBE_PROXY_TOKEN:-}" ]]; then append_or_replace_prefixed_line "${known_tokens_csv}" "${KUBE_PROXY_TOKEN}," "system:kube-proxy,uid:kube_proxy" fi - if [[ -n "${NODE_PROBLEM_DETECTOR_TOKEN:-}" ]]; then - append_or_replace_prefixed_line "${known_tokens_csv}" "${NODE_PROBLEM_DETECTOR_TOKEN}," 
"system:node-problem-detector,uid:node-problem-detector" - fi if [[ -n "${GCE_GLBC_TOKEN:-}" ]]; then append_or_replace_prefixed_line "${known_tokens_csv}" "${GCE_GLBC_TOKEN}," "system:controller:glbc,uid:system:controller:glbc" fi @@ -1239,7 +1236,7 @@ rules: # node and pod status calls from nodes are high-volume and can be large, don't log responses for expected updates from nodes - level: Request - users: ["kubelet", "system:node-problem-detector", "system:serviceaccount:kube-system:node-problem-detector"] + users: ["kubelet"] verbs: ["update","patch"] resources: - group: "" # core @@ -1438,41 +1435,6 @@ ${SCHEDULER_POLICY_CONFIG} EOF } -function create-node-problem-detector-kubeconfig { - local apiserver_address="${1}" - if [[ -z "${apiserver_address}" ]]; then - echo "Must provide API server address to create node-problem-detector kubeconfig file!" - exit 1 - fi - echo "Creating node-problem-detector kubeconfig file" - mkdir -p /var/lib/node-problem-detector - cat </var/lib/node-problem-detector/kubeconfig -apiVersion: v1 -kind: Config -users: -- name: node-problem-detector - user: - token: ${NODE_PROBLEM_DETECTOR_TOKEN} -clusters: -- name: local - cluster: - server: https://${apiserver_address} - certificate-authority-data: ${CA_CERT} -contexts: -- context: - cluster: local - user: node-problem-detector - name: service-account-context -current-context: service-account-context -EOF -} - -function create-node-problem-detector-kubeconfig-from-kubelet { - echo "Creating node-problem-detector kubeconfig from /var/lib/kubelet/kubeconfig" - mkdir -p /var/lib/node-problem-detector - cp /var/lib/kubelet/kubeconfig /var/lib/node-problem-detector/kubeconfig -} - function create-master-etcd-auth { if [[ -n "${ETCD_CA_CERT:-}" && -n "${ETCD_PEER_KEY:-}" && -n "${ETCD_PEER_CERT:-}" ]]; then local -r auth_dir="/etc/srv/kubernetes" @@ -1688,56 +1650,6 @@ EOF systemctl start kubelet.service } -# This function assembles the node problem detector systemd service file and -# 
starts it using systemctl. -function start-node-problem-detector { - echo "Start node problem detector" - local -r npd_bin="${KUBE_HOME}/bin/node-problem-detector" - echo "Using node problem detector binary at ${npd_bin}" - - local flags="${NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS:-}" - if [[ -z "${flags}" ]]; then - local -r km_config="${KUBE_HOME}/node-problem-detector/config/kernel-monitor.json" - # TODO(random-liu): Handle this for alternative container runtime. - local -r dm_config="${KUBE_HOME}/node-problem-detector/config/docker-monitor.json" - local -r sm_config="${KUBE_HOME}/node-problem-detector/config/systemd-monitor.json" - local -r ssm_config="${KUBE_HOME}/node-problem-detector/config/system-stats-monitor.json" - - local -r custom_km_config="${KUBE_HOME}/node-problem-detector/config/kernel-monitor-counter.json" - local -r custom_sm_config="${KUBE_HOME}/node-problem-detector/config/systemd-monitor-counter.json" - - flags="${NPD_TEST_LOG_LEVEL:-"--v=2"} ${NPD_TEST_ARGS:-}" - flags+=" --logtostderr" - flags+=" --config.system-log-monitor=${km_config},${dm_config},${sm_config}" - flags+=" --config.system-stats-monitor=${ssm_config}" - flags+=" --config.custom-plugin-monitor=${custom_km_config},${custom_sm_config}" - local -r npd_port=${NODE_PROBLEM_DETECTOR_PORT:-20256} - flags+=" --port=${npd_port}" - if [[ -n "${EXTRA_NPD_ARGS:-}" ]]; then - flags+=" ${EXTRA_NPD_ARGS}" - fi - fi - flags+=" --apiserver-override=https://${KUBERNETES_MASTER_NAME}?inClusterConfig=false&auth=/var/lib/node-problem-detector/kubeconfig" - - # Write the systemd service file for node problem detector. 
- cat </etc/systemd/system/node-problem-detector.service -[Unit] -Description=Kubernetes node problem detector -Requires=network-online.target -After=network-online.target - -[Service] -Restart=always -RestartSec=10 -ExecStart=${npd_bin} ${flags} - -[Install] -WantedBy=multi-user.target -EOF - - systemctl start node-problem-detector.service -} - # Create the log file and set its properties. # # $1 is the file to create. @@ -3605,16 +3517,6 @@ function main() { log-wrap 'CreateNodePKI' create-node-pki log-wrap 'CreateKubeletKubeconfig' create-kubelet-kubeconfig "${KUBERNETES_MASTER_NAME}" log-wrap 'CreateKubeproxyUserKubeconfig' create-kubeproxy-user-kubeconfig - if [[ "${ENABLE_NODE_PROBLEM_DETECTOR:-}" == "standalone" ]]; then - if [[ -n "${NODE_PROBLEM_DETECTOR_TOKEN:-}" ]]; then - log-wrap 'CreateNodeProblemDetectorKubeconfig' create-node-problem-detector-kubeconfig "${KUBERNETES_MASTER_NAME}" - elif [[ -f "/var/lib/kubelet/kubeconfig" ]]; then - log-wrap 'CreateNodeProblemDetectorKubeconfigFromKubelet' create-node-problem-detector-kubeconfig-from-kubelet - else - echo "Either NODE_PROBLEM_DETECTOR_TOKEN or /var/lib/kubelet/kubeconfig must be set" - exit 1 - fi - fi fi log-wrap 'DetectCgroupConfig' detect-cgroup-config @@ -3669,9 +3571,6 @@ function main() { log-wrap 'UpdateLegacyAddonNodeLabels' update-legacy-addon-node-labels & else log-wrap 'StartKubeProxy' start-kube-proxy - if [[ "${ENABLE_NODE_PROBLEM_DETECTOR:-}" == "standalone" ]]; then - log-wrap 'StartNodeProblemDetector' start-node-problem-detector - fi fi log-wrap 'ResetMotd' reset-motd diff --git a/cluster/gce/upgrade.sh b/cluster/gce/upgrade.sh index 4641ed6b24d..27b19bd3a71 100755 --- a/cluster/gce/upgrade.sh +++ b/cluster/gce/upgrade.sh @@ -114,11 +114,6 @@ function upgrade-master() { function upgrade-master-env() { echo "== Upgrading master environment variables. ==" - # Generate the node problem detector token if it isn't present on the original - # master. 
- if [[ "${ENABLE_NODE_PROBLEM_DETECTOR:-}" == "standalone" && "${NODE_PROBLEM_DETECTOR_TOKEN:-}" == "" ]]; then - NODE_PROBLEM_DETECTOR_TOKEN=$(dd if=/dev/urandom bs=128 count=1 2>/dev/null | base64 | tr -d "=+/" | dd bs=32 count=1 2>/dev/null) - fi } function wait-for-master() { @@ -193,7 +188,6 @@ function get-node-os() { # # Vars set: # KUBE_PROXY_TOKEN -# NODE_PROBLEM_DETECTOR_TOKEN # CA_CERT_BASE64 # EXTRA_DOCKER_OPTS # KUBELET_CERT_BASE64 @@ -232,7 +226,6 @@ function setup-base-image() { # SANITIZED_VERSION # INSTANCE_GROUPS # KUBE_PROXY_TOKEN -# NODE_PROBLEM_DETECTOR_TOKEN # CA_CERT_BASE64 # EXTRA_DOCKER_OPTS # KUBELET_CERT_BASE64 @@ -257,8 +250,6 @@ function prepare-node-upgrade() { node_env=$(get-node-env) KUBE_PROXY_TOKEN=$(get-env-val "${node_env}" "KUBE_PROXY_TOKEN") export KUBE_PROXY_TOKEN - NODE_PROBLEM_DETECTOR_TOKEN=$(get-env-val "${node_env}" "NODE_PROBLEM_DETECTOR_TOKEN") - export NODE_PROBLEM_DETECTOR_TOKEN CA_CERT_BASE64=$(get-env-val "${node_env}" "CA_CERT") export CA_CERT_BASE64 EXTRA_DOCKER_OPTS=$(get-env-val "${node_env}" "EXTRA_DOCKER_OPTS") @@ -286,14 +277,6 @@ function prepare-node-upgrade() { function upgrade-node-env() { echo "== Upgrading node environment variables. ==" - # Get the node problem detector token from master if it isn't present on - # the original node. - if [[ "${ENABLE_NODE_PROBLEM_DETECTOR:-}" == "standalone" && "${NODE_PROBLEM_DETECTOR_TOKEN:-}" == "" ]]; then - detect-master - local master_env - master_env=$(get-master-env) - NODE_PROBLEM_DETECTOR_TOKEN=$(get-env-val "${master_env}" "NODE_PROBLEM_DETECTOR_TOKEN") - fi } # Upgrades a single node. 
diff --git a/cluster/gce/util.sh b/cluster/gce/util.sh index 5ae35b3009d..f183436e222 100755 --- a/cluster/gce/util.sh +++ b/cluster/gce/util.sh @@ -1128,11 +1128,6 @@ DOCKER_REGISTRY_MIRROR_URL: $(yaml-quote "${DOCKER_REGISTRY_MIRROR_URL:-}") ENABLE_L7_LOADBALANCING: $(yaml-quote "${ENABLE_L7_LOADBALANCING:-none}") ENABLE_CLUSTER_LOGGING: $(yaml-quote "${ENABLE_CLUSTER_LOGGING:-false}") ENABLE_AUTH_PROVIDER_GCP: $(yaml-quote "${ENABLE_AUTH_PROVIDER_GCP:-true}") -ENABLE_NODE_PROBLEM_DETECTOR: $(yaml-quote "${ENABLE_NODE_PROBLEM_DETECTOR:-none}") -NODE_PROBLEM_DETECTOR_VERSION: $(yaml-quote "${NODE_PROBLEM_DETECTOR_VERSION:-}") -NODE_PROBLEM_DETECTOR_TAR_HASH: $(yaml-quote "${NODE_PROBLEM_DETECTOR_TAR_HASH:-}") -NODE_PROBLEM_DETECTOR_RELEASE_PATH: $(yaml-quote "${NODE_PROBLEM_DETECTOR_RELEASE_PATH:-}") -NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS: $(yaml-quote "${NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS:-}") CNI_STORAGE_URL_BASE: $(yaml-quote "${CNI_STORAGE_URL_BASE:-}") CNI_TAR_PREFIX: $(yaml-quote "${CNI_TAR_PREFIX:-}") CNI_VERSION: $(yaml-quote "${CNI_VERSION:-}") @@ -1151,7 +1146,6 @@ ENABLE_DNS_HORIZONTAL_AUTOSCALER: $(yaml-quote "${ENABLE_DNS_HORIZONTAL_AUTOSCAL KUBE_PROXY_TOKEN: $(yaml-quote "${KUBE_PROXY_TOKEN:-}") KUBE_PROXY_MODE: $(yaml-quote "${KUBE_PROXY_MODE:-iptables}") DETECT_LOCAL_MODE: $(yaml-quote "${DETECT_LOCAL_MODE:-}") -NODE_PROBLEM_DETECTOR_TOKEN: $(yaml-quote "${NODE_PROBLEM_DETECTOR_TOKEN:-}") ADMISSION_CONTROL: $(yaml-quote "${ADMISSION_CONTROL:-}") MASTER_IP_RANGE: $(yaml-quote "${MASTER_IP_RANGE}") RUNTIME_CONFIG: $(yaml-quote "${RUNTIME_CONFIG}") @@ -1615,13 +1609,6 @@ WINDOWS_INFRA_CONTAINER: $(yaml-quote "${WINDOWS_INFRA_CONTAINER}") WINDOWS_ENABLE_PIGZ: $(yaml-quote "${WINDOWS_ENABLE_PIGZ}") WINDOWS_ENABLE_HYPERV: $(yaml-quote "${WINDOWS_ENABLE_HYPERV}") ENABLE_AUTH_PROVIDER_GCP: $(yaml-quote "${ENABLE_AUTH_PROVIDER_GCP}") -ENABLE_NODE_PROBLEM_DETECTOR: $(yaml-quote "${WINDOWS_ENABLE_NODE_PROBLEM_DETECTOR}") -NODE_PROBLEM_DETECTOR_VERSION: 
$(yaml-quote "${NODE_PROBLEM_DETECTOR_VERSION}") -NODE_PROBLEM_DETECTOR_TAR_HASH: $(yaml-quote "${NODE_PROBLEM_DETECTOR_TAR_HASH}") -NODE_PROBLEM_DETECTOR_RELEASE_PATH: $(yaml-quote "${NODE_PROBLEM_DETECTOR_RELEASE_PATH}") -NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS: $(yaml-quote "${WINDOWS_NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS}") -NODE_PROBLEM_DETECTOR_TOKEN: $(yaml-quote "${NODE_PROBLEM_DETECTOR_TOKEN:-}") -WINDOWS_NODEPROBLEMDETECTOR_KUBECONFIG_FILE: $(yaml-quote "${WINDOWS_NODEPROBLEMDETECTOR_KUBECONFIG_FILE}") AUTH_PROVIDER_GCP_STORAGE_PATH: $(yaml-quote "${AUTH_PROVIDER_GCP_STORAGE_PATH}") AUTH_PROVIDER_GCP_VERSION: $(yaml-quote "${AUTH_PROVIDER_GCP_VERSION}") AUTH_PROVIDER_GCP_HASH_WINDOWS_AMD64: $(yaml-quote "${AUTH_PROVIDER_GCP_HASH_WINDOWS_AMD64}") @@ -2080,7 +2067,6 @@ function parse-master-env() { local master_env master_env=$(get-master-env) KUBE_PROXY_TOKEN=$(get-env-val "${master_env}" "KUBE_PROXY_TOKEN") - NODE_PROBLEM_DETECTOR_TOKEN=$(get-env-val "${master_env}" "NODE_PROBLEM_DETECTOR_TOKEN") CA_CERT_BASE64=$(get-env-val "${master_env}" "CA_CERT") CA_KEY_BASE64=$(get-env-val "${master_env}" "CA_KEY") KUBEAPISERVER_CERT_BASE64=$(get-env-val "${master_env}" "KUBEAPISERVER_CERT") @@ -2903,9 +2889,6 @@ function create-master() { # computer) can forget it later. This should disappear with # http://issue.k8s.io/3168 KUBE_PROXY_TOKEN=$(dd if=/dev/urandom bs=128 count=1 2>/dev/null | base64 | tr -d "=+/" | dd bs=32 count=1 2>/dev/null) - if [[ "${ENABLE_NODE_PROBLEM_DETECTOR:-}" == "standalone" ]]; then - NODE_PROBLEM_DETECTOR_TOKEN=$(dd if=/dev/urandom bs=128 count=1 2>/dev/null | base64 | tr -d "=+/" | dd bs=32 count=1 2>/dev/null) - fi # Reserve the master's IP so that it can later be transferred to another VM # without disrupting the kubelets. 
diff --git a/cluster/gce/windows/configure.ps1 b/cluster/gce/windows/configure.ps1 index adb5749635f..3d52bcd3af3 100644 --- a/cluster/gce/windows/configure.ps1 +++ b/cluster/gce/windows/configure.ps1 @@ -163,14 +163,12 @@ try { Configure-Crictl Setup-ContainerRuntime DownloadAndInstall-KubernetesBinaries - DownloadAndInstall-NodeProblemDetector DownloadAndInstall-CSIProxyBinaries DownloadAndInstall-AuthProviderGcpBinary Start-CSIProxy Create-NodePki Create-KubeletKubeconfig Create-KubeproxyKubeconfig - Create-NodeProblemDetectorKubeConfig Create-AuthProviderGcpConfig Set-PodCidr Configure-HostNetworkingService @@ -178,7 +176,6 @@ try { Configure-HostDnsConf Configure-GcePdTools Configure-Kubelet - Configure-NodeProblemDetector # Even if Logging agent is already installed, the function will still [re]start the service. if (IsLoggingEnabled $kube_env) { diff --git a/cluster/gce/windows/k8s-node-setup.psm1 b/cluster/gce/windows/k8s-node-setup.psm1 index efcfa47b204..42d830dd4d7 100644 --- a/cluster/gce/windows/k8s-node-setup.psm1 +++ b/cluster/gce/windows/k8s-node-setup.psm1 @@ -295,8 +295,6 @@ function Set-EnvironmentVars { "INFRA_CONTAINER" = ${kube_env}['WINDOWS_INFRA_CONTAINER'] "WINDOWS_ENABLE_PIGZ" = ${kube_env}['WINDOWS_ENABLE_PIGZ'] "WINDOWS_ENABLE_HYPERV" = ${kube_env}['WINDOWS_ENABLE_HYPERV'] - "ENABLE_NODE_PROBLEM_DETECTOR" = ${kube_env}['ENABLE_NODE_PROBLEM_DETECTOR'] - "NODEPROBLEMDETECTOR_KUBECONFIG_FILE" = ${kube_env}['WINDOWS_NODEPROBLEMDETECTOR_KUBECONFIG_FILE'] "ENABLE_AUTH_PROVIDER_GCP" = ${kube_env}['ENABLE_AUTH_PROVIDER_GCP'] "AUTH_PROVIDER_GCP_STORAGE_PATH" = ${kube_env}['AUTH_PROVIDER_GCP_STORAGE_PATH'] "AUTH_PROVIDER_GCP_VERSION" = ${kube_env}['AUTH_PROVIDER_GCP_VERSION'] @@ -1484,140 +1482,6 @@ function Install-Pigz { } } -# Node Problem Detector Resources -$NPD_SERVICE = "node-problem-detector" -$DEFAULT_NPD_VERSION = '0.8.10-gke0.1' -$DEFAULT_NPD_RELEASE_PATH = 'https://storage.googleapis.com/gke-release/winnode' -$DEFAULT_NPD_HASH = 
'97ddfe3544da9e02a1cfb55d24f329eb29d606fca7fbbf800415d5de9dbc29a00563f8e0d1919595c8e316fd989d45b09b13c07be528841fc5fd37e21d016a2d' - -# Install Node Problem Detector (NPD). -# NPD analyzes the host for problems that can disrupt workloads. -# https://github.com/kubernetes/node-problem-detector -function DownloadAndInstall-NodeProblemDetector { - if ("${env:ENABLE_NODE_PROBLEM_DETECTOR}" -eq "standalone") { - if (ShouldWrite-File "${env:NODE_DIR}\node-problem-detector.exe") { - $npd_version = $DEFAULT_NPD_VERSION - $npd_hash = $DEFAULT_NPD_HASH - if (-not [string]::IsNullOrEmpty(${kube_env}['NODE_PROBLEM_DETECTOR_VERSION'])) { - $npd_version = ${kube_env}['NODE_PROBLEM_DETECTOR_VERSION'] - $npd_hash = ${kube_env}['NODE_PROBLEM_DETECTOR_TAR_HASH'] - } - $npd_release_path = $DEFAULT_NPD_RELEASE_PATH - if (-not [string]::IsNullOrEmpty(${kube_env}['NODE_PROBLEM_DETECTOR_RELEASE_PATH'])) { - $npd_release_path = ${kube_env}['NODE_PROBLEM_DETECTOR_RELEASE_PATH'] - } - - $npd_tar = "node-problem-detector-v${npd_version}-windows_amd64.tar.gz" - - Log-Output "Downloading ${npd_tar}." - - $npd_dir = "${env:K8S_DIR}\node-problem-detector" - New-Item -Path $npd_dir -ItemType Directory -Force -Confirm:$false - - MustDownload-File ` - -URLs "${npd_release_path}/node-problem-detector/${npd_tar}" ` - -Hash $npd_hash ` - -Algorithm SHA512 ` - -OutFile "${npd_dir}\${npd_tar}" - - tar xzvf "${npd_dir}\${npd_tar}" -C $npd_dir - Move-Item "${npd_dir}\bin\*" "${env:NODE_DIR}\" -Force -Confirm:$false - Remove-Item "${npd_dir}\bin" -Force -Confirm:$false - Remove-Item "${npd_dir}\${npd_tar}" -Force -Confirm:$false - } - else { - Log-Output "Node Problem Detector already installed." - } - } -} - -# Creates the node-problem-detector user kubeconfig file at -# $env:NODEPROBLEMDETECTOR_KUBECONFIG_FILE (if defined). -# -# Create-NodePki() must be called first. 
-# -# Required ${kube_env} keys: -# CA_CERT -# NODE_PROBLEM_DETECTOR_TOKEN -function Create-NodeProblemDetectorKubeConfig { - if ("${env:ENABLE_NODE_PROBLEM_DETECTOR}" -eq "standalone") { - if (-not [string]::IsNullOrEmpty(${kube_env]['NODE_PROBLEM_DETECTOR_TOKEN']})) { - Log-Output "Create-NodeProblemDetectorKubeConfig using Node Problem Detector token" - Create-Kubeconfig -Name 'node-problem-detector' ` - -Path ${env:NODEPROBLEMDETECTOR_KUBECONFIG_FILE} ` - -Token ${kube_env}['NODE_PROBLEM_DETECTOR_TOKEN'] - } elseif (Test-Path ${env:BOOTSTRAP_KUBECONFIG}) { - Log-Output "Create-NodeProblemDetectorKubeConfig creating kubeconfig from kubelet kubeconfig" - Copy-Item ${env:BOOTSTRAP_KUBECONFIG} -Destination ${env:NODEPROBLEMDETECTOR_KUBECONFIG_FILE} - Log-Output ("node-problem-detector bootstrap kubeconfig:`n" + - "$(Get-Content -Raw ${env:NODEPROBLEMDETECTOR_KUBECONFIG_FILE})") - } else { - Log-Output "Either NODE_PROBLEM_DETECTOR_TOKEN or ${env:BOOTSTRAP_KUBECONFIG} must be set" - exit 1 - } - } -} - -# Configures NPD to run with the bundled monitor configs and report against the Kubernetes api server. 
-function Configure-NodeProblemDetector { - $npd_bin = "${env:NODE_DIR}\node-problem-detector.exe" - if ("${env:ENABLE_NODE_PROBLEM_DETECTOR}" -eq "standalone" -and (Test-Path $npd_bin)) { - $npd_svc = Get-Service -Name $NPD_SERVICE -ErrorAction SilentlyContinue - if ($npd_svc -eq $null) { - $npd_dir = "${env:K8S_DIR}\node-problem-detector" - $npd_logs_dir = "${env:LOGS_DIR}\node-problem-detector" - - New-Item -Path $npd_logs_dir -Type Directory -Force -Confirm:$false - - $flags = '' - if ([string]::IsNullOrEmpty(${kube_env}['NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS'])) { - $system_log_monitors = @() - $system_stats_monitors = @() - $custom_plugin_monitors = @() - - # Custom Plugin Monitors - $custom_plugin_monitors += @("${npd_dir}\config\windows-health-checker-kubelet.json") - $custom_plugin_monitors += @("${npd_dir}\config\windows-health-checker-kubeproxy.json") - $custom_plugin_monitors += @("${npd_dir}\config\windows-defender-monitor.json") - - # System Stats Monitors - $system_stats_monitors += @("${npd_dir}\config\windows-system-stats-monitor.json") - - # NPD Configuration for CRI monitor - $system_log_monitors += @("${npd_dir}\config\windows-containerd-monitor-filelog.json") - $custom_plugin_monitors += @("${npd_dir}\config\windows-health-checker-containerd.json") - - $flags="--v=2 --port=20256 --log_dir=${npd_logs_dir}" - if ($system_log_monitors.count -gt 0) { - $flags+=" --config.system-log-monitor={0}" -f ($system_log_monitors -join ",") - } - if ($system_stats_monitors.count -gt 0) { - $flags+=" --config.system-stats-monitor={0}" -f ($system_stats_monitors -join ",") - } - if ($custom_plugin_monitors.count -gt 0) { - $flags+=" --config.custom-plugin-monitor={0}" -f ($custom_plugin_monitors -join ",") - } - } - else { - $flags = ${kube_env}['NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS'] - } - $kubernetes_master_name = ${kube_env}['KUBERNETES_MASTER_NAME'] - $flags = "${flags} 
--apiserver-override=`"https://${kubernetes_master_name}?inClusterConfig=false&auth=${env:NODEPROBLEMDETECTOR_KUBECONFIG_FILE}`"" - - Log-Output "Creating service: ${NPD_SERVICE}" - Log-Output "${npd_bin} ${flags}" - sc.exe create $NPD_SERVICE binpath= "${npd_bin} ${flags}" displayName= "Node Problem Detector" - sc.exe failure $NPD_SERVICE reset= 30 actions= restart/5000 - sc.exe start $NPD_SERVICE - - Write-VerboseServiceInfoToConsole -Service $NPD_SERVICE - } - else { - Log-Output "${NPD_SERVICE} already configured." - } - } -} - # TODO(pjh): move the logging agent code below into a separate # module; it was put here temporarily to avoid disrupting the file layout in # the K8s release machinery. @@ -1872,21 +1736,6 @@ $FLUENTBIT_CONFIG = @' Parser docker Parser containerd -# Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg -# Example: -# I0716 02:08:55.559351 3356 log_spam.go:42] Command line arguments: -[INPUT] - Name tail - Alias node-problem-detector - Tag node-problem-detector - Mem_Buf_Limit 5MB - Skip_Long_Lines On - Refresh_Interval 5 - Path C:\etc\kubernetes\logs\node-problem-detector\*.log.INFO* - DB /var/run/google-fluentbit/pos-files/node-problem-detector.db - Multiline On - Parser_Firstline glog - # Example: # I0928 03:15:50.440223 4880 main.go:51] Starting CSI-Proxy Server ... 
[INPUT] diff --git a/cluster/log-dump/log-dump.sh b/cluster/log-dump/log-dump.sh index b7505099f0e..f73a9409e0d 100755 --- a/cluster/log-dump/log-dump.sh +++ b/cluster/log-dump/log-dump.sh @@ -42,8 +42,8 @@ readonly node_ssh_supported_providers="gce gke aws" readonly gcloud_supported_providers="gce gke" readonly master_logfiles="kube-apiserver.log kube-apiserver-audit.log kube-scheduler.log kube-controller-manager.log cloud-controller-manager.log etcd.log etcd-events.log glbc.log cluster-autoscaler.log kube-addon-manager.log konnectivity-server.log fluentd.log kubelet.cov" -readonly node_logfiles="kube-proxy.log containers/konnectivity-agent-*.log fluentd.log node-problem-detector.log kubelet.cov kube-network-policies.log" -readonly node_systemd_services="node-problem-detector" +readonly node_logfiles="kube-proxy.log containers/konnectivity-agent-*.log fluentd.log kubelet.cov kube-network-policies.log" +readonly node_systemd_services="" readonly hollow_node_logfiles="kubelet-hollow-node-*.log kubeproxy-hollow-node-*.log npd-hollow-node-*.log" readonly aws_logfiles="cloud-init-output.log" readonly gce_logfiles="startupscript.log"