mirror of
https://github.com/kubernetes/kubernetes.git
synced 2026-06-13 10:50:56 -04:00
Merge pull request #137278 from humblec/update-npd-v1.35.2
Update node-problem-detector to v1.35.2 and remove addon manifests
This commit is contained in:
commit
79751b17da
22 changed files with 15 additions and 577 deletions
|
|
@ -109,23 +109,12 @@ dependencies:
|
|||
match: registry.k8s.io/e2e-test-images/agnhost:\d+\.\d+\.\d+
|
||||
|
||||
- name: "node-problem-detector"
|
||||
version: 1.34.0
|
||||
version: 1.35.2
|
||||
refPaths:
|
||||
- path: test/e2e_node/image_list.go
|
||||
match: const defaultImage
|
||||
- path: test/kubemark/resources/hollow-node_template.yaml
|
||||
match: registry.k8s.io/node-problem-detector/node-problem-detector
|
||||
- path: cluster/addons/node-problem-detector/npd.yaml
|
||||
match: registry.k8s.io/node-problem-detector/node-problem-detector
|
||||
- path: cluster/addons/node-problem-detector/npd.yaml
|
||||
match: app.kubernetes.io/version
|
||||
# TODO(dims): Ensure newer versions get uploaded to
|
||||
# - https://console.cloud.google.com/storage/browser/gke-release/winnode/node-problem-detector
|
||||
# - https://gcsweb.k8s.io/gcs/kubernetes-release/node-problem-detector/
|
||||
# and then the following references get fixed.
|
||||
#
|
||||
- path: cluster/gce/gci/configure.sh
|
||||
match: DEFAULT_NPD_VERSION=
|
||||
#- path: cluster/gce/windows/k8s-node-setup.psm1
|
||||
# match: DEFAULT_NPD_VERSION
|
||||
|
||||
|
|
|
|||
|
|
@ -267,14 +267,6 @@ data:
|
|||
tag kubelet
|
||||
</source>
|
||||
|
||||
<source>
|
||||
@type systemd
|
||||
filters [{ "_SYSTEMD_UNIT": "node-problem-detector.service" }]
|
||||
pos_file /var/log/gcp-journald-node-problem-detector.pos
|
||||
read_from_head true
|
||||
tag node-problem-detector
|
||||
</source>
|
||||
|
||||
# BEGIN_NODE_JOURNAL
|
||||
# Whether to include node-journal or not is determined when starting the
|
||||
# cluster. It is not changed when the cluster is already running.
|
||||
|
|
@ -289,7 +281,7 @@ data:
|
|||
@type grep
|
||||
<exclude>
|
||||
key _SYSTEMD_UNIT
|
||||
pattern ^(docker|{{ fluentd_container_runtime_service }}|kubelet|node-problem-detector)\.service$
|
||||
pattern ^(docker|{{ fluentd_container_runtime_service }}|kubelet)\.service$
|
||||
</exclude>
|
||||
</filter>
|
||||
# END_NODE_JOURNAL
|
||||
|
|
|
|||
|
|
@ -282,14 +282,6 @@ data:
|
|||
tag kubelet
|
||||
</source>
|
||||
|
||||
<source>
|
||||
@type systemd
|
||||
filters [{ "_SYSTEMD_UNIT": "node-problem-detector.service" }]
|
||||
pos_file /var/log/gcp-journald-node-problem-detector.pos
|
||||
read_from_head true
|
||||
tag node-problem-detector
|
||||
</source>
|
||||
|
||||
# BEGIN_NODE_JOURNAL
|
||||
# Whether to include node-journal or not is determined when starting the
|
||||
# cluster. It is not changed when the cluster is already running.
|
||||
|
|
@ -304,7 +296,7 @@ data:
|
|||
@type grep
|
||||
<exclude>
|
||||
key _SYSTEMD_UNIT
|
||||
pattern ^(docker|{{ fluentd_container_runtime_service }}|kubelet|node-problem-detector)\.service$
|
||||
pattern ^(docker|{{ fluentd_container_runtime_service }}|kubelet)\.service$
|
||||
</exclude>
|
||||
</filter>
|
||||
# END_NODE_JOURNAL
|
||||
|
|
|
|||
|
|
@ -1,4 +0,0 @@
|
|||
# Maintainers
|
||||
|
||||
Random-Liu <lantaol@google.com>
|
||||
wangzhen127 <zhenw@google.com>
|
||||
|
|
@ -1,8 +0,0 @@
|
|||
# See the OWNERS docs at https://go.k8s.io/owners
|
||||
|
||||
approvers:
|
||||
- Random-Liu
|
||||
- wangzhen127
|
||||
reviewers:
|
||||
- Random-Liu
|
||||
- wangzhen127
|
||||
|
|
@ -1,7 +0,0 @@
|
|||
# Node Problem Detector
|
||||
==============
|
||||
|
||||
Node Problem Detector is a DaemonSet running on each node, detecting node
|
||||
problems.
|
||||
|
||||
Learn more at: https://github.com/kubernetes/node-problem-detector
|
||||
|
|
@ -1,15 +0,0 @@
|
|||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: kubelet-user-npd-binding
|
||||
labels:
|
||||
kubernetes.io/cluster-service: "true"
|
||||
addonmanager.kubernetes.io/mode: Reconcile
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: system:node-problem-detector
|
||||
subjects:
|
||||
- apiGroup: rbac.authorization.k8s.io
|
||||
kind: User
|
||||
name: kubelet
|
||||
|
|
@ -1,96 +0,0 @@
|
|||
apiVersion: v1
|
||||
kind: ServiceAccount
|
||||
metadata:
|
||||
name: node-problem-detector
|
||||
namespace: kube-system
|
||||
labels:
|
||||
kubernetes.io/cluster-service: "true"
|
||||
addonmanager.kubernetes.io/mode: Reconcile
|
||||
---
|
||||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: npd-binding
|
||||
labels:
|
||||
kubernetes.io/cluster-service: "true"
|
||||
addonmanager.kubernetes.io/mode: Reconcile
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: system:node-problem-detector
|
||||
subjects:
|
||||
- kind: ServiceAccount
|
||||
name: node-problem-detector
|
||||
namespace: kube-system
|
||||
---
|
||||
apiVersion: apps/v1
|
||||
kind: DaemonSet
|
||||
metadata:
|
||||
name: node-problem-detector
|
||||
namespace: kube-system
|
||||
labels:
|
||||
app.kubernetes.io/name: node-problem-detector
|
||||
app.kubernetes.io/version: v1.34.0
|
||||
addonmanager.kubernetes.io/mode: Reconcile
|
||||
spec:
|
||||
selector:
|
||||
matchLabels:
|
||||
app.kubernetes.io/name: node-problem-detector
|
||||
app.kubernetes.io/version: v1.34.0
|
||||
template:
|
||||
metadata:
|
||||
labels:
|
||||
app.kubernetes.io/name: node-problem-detector
|
||||
app.kubernetes.io/version: v1.34.0
|
||||
spec:
|
||||
nodeSelectors:
|
||||
kubernetes.io/os: linux
|
||||
containers:
|
||||
- name: node-problem-detector
|
||||
image: registry.k8s.io/node-problem-detector/node-problem-detector:v1.34.0
|
||||
command:
|
||||
- "/bin/sh"
|
||||
- "-c"
|
||||
- "exec /node-problem-detector --logtostderr --config.system-log-monitor=/config/kernel-monitor.json,/config/systemd-monitor.json --config.custom-plugin-monitor=/config/kernel-monitor-counter.json,/config/systemd-monitor-counter.json --config.system-stats-monitor=/config/system-stats-monitor.json >>/var/log/node-problem-detector.log 2>&1"
|
||||
securityContext:
|
||||
privileged: true
|
||||
resources:
|
||||
limits:
|
||||
cpu: "200m"
|
||||
memory: "100Mi"
|
||||
requests:
|
||||
cpu: "20m"
|
||||
memory: "20Mi"
|
||||
env:
|
||||
- name: NODE_NAME
|
||||
valueFrom:
|
||||
fieldRef:
|
||||
fieldPath: spec.nodeName
|
||||
volumeMounts:
|
||||
- name: log
|
||||
mountPath: /var/log
|
||||
- name: kmsg
|
||||
mountPath: /dev/kmsg
|
||||
readOnly: true
|
||||
- name: localtime
|
||||
mountPath: /etc/localtime
|
||||
readOnly: true
|
||||
volumes:
|
||||
- name: log
|
||||
hostPath:
|
||||
path: /var/log/
|
||||
- name: kmsg
|
||||
hostPath:
|
||||
path: /dev/kmsg
|
||||
- name: localtime
|
||||
hostPath:
|
||||
path: /etc/localtime
|
||||
type: "FileOrCreate"
|
||||
serviceAccountName: node-problem-detector
|
||||
tolerations:
|
||||
- operator: "Exists"
|
||||
effect: "NoExecute"
|
||||
- operator: "Exists"
|
||||
effect: "NoSchedule"
|
||||
- key: "CriticalAddonsOnly"
|
||||
operator: "Exists"
|
||||
|
|
@ -1,15 +0,0 @@
|
|||
apiVersion: rbac.authorization.k8s.io/v1
|
||||
kind: ClusterRoleBinding
|
||||
metadata:
|
||||
name: npd-binding
|
||||
labels:
|
||||
kubernetes.io/cluster-service: "true"
|
||||
addonmanager.kubernetes.io/mode: Reconcile
|
||||
roleRef:
|
||||
apiGroup: rbac.authorization.k8s.io
|
||||
kind: ClusterRole
|
||||
name: system:node-problem-detector
|
||||
subjects:
|
||||
- apiGroup: rbac.authorization.k8s.io
|
||||
kind: User
|
||||
name: system:node-problem-detector
|
||||
|
|
@ -151,8 +151,6 @@ export WINDOWS_KUBECONFIG_FILE="${WINDOWS_K8S_DIR}\kubelet.kubeconfig"
|
|||
export WINDOWS_BOOTSTRAP_KUBECONFIG_FILE="${WINDOWS_K8S_DIR}\kubelet.bootstrap-kubeconfig"
|
||||
# Path for kube-proxy kubeconfig file on Windows nodes.
|
||||
export WINDOWS_KUBEPROXY_KUBECONFIG_FILE="${WINDOWS_K8S_DIR}\kubeproxy.kubeconfig"
|
||||
# Path for kube-proxy kubeconfig file on Windows nodes.
|
||||
export WINDOWS_NODEPROBLEMDETECTOR_KUBECONFIG_FILE="${WINDOWS_K8S_DIR}\node-problem-detector.kubeconfig"
|
||||
# Pause container image for Windows container.
|
||||
export WINDOWS_INFRA_CONTAINER="registry.k8s.io/pause:3.10.2"
|
||||
# Storage Path for csi-proxy. csi-proxy only needs to be installed for Windows.
|
||||
|
|
|
|||
|
|
@ -289,16 +289,6 @@ export DNS_MEMORY_LIMIT="${KUBE_DNS_MEMORY_LIMIT:-170Mi}"
|
|||
# Optional: Enable DNS horizontal autoscaler
|
||||
export ENABLE_DNS_HORIZONTAL_AUTOSCALER="${KUBE_ENABLE_DNS_HORIZONTAL_AUTOSCALER:-true}"
|
||||
|
||||
# Optional: Install node problem detector.
|
||||
# none - Not run node problem detector.
|
||||
# daemonset - Run node problem detector as daemonset.
|
||||
# standalone - Run node problem detector as standalone system daemon.
|
||||
export ENABLE_NODE_PROBLEM_DETECTOR="${KUBE_ENABLE_NODE_PROBLEM_DETECTOR:-daemonset}"
|
||||
NODE_PROBLEM_DETECTOR_VERSION="${NODE_PROBLEM_DETECTOR_VERSION:-}"
|
||||
NODE_PROBLEM_DETECTOR_TAR_HASH="${NODE_PROBLEM_DETECTOR_TAR_HASH:-}"
|
||||
NODE_PROBLEM_DETECTOR_RELEASE_PATH="${NODE_PROBLEM_DETECTOR_RELEASE_PATH:-}"
|
||||
NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS="${NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS:-}"
|
||||
|
||||
CNI_HASH="${CNI_HASH:-}"
|
||||
CNI_TAR_PREFIX="${CNI_TAR_PREFIX:-cni-plugins-linux-amd64-}"
|
||||
CNI_STORAGE_URL_BASE="${CNI_STORAGE_URL_BASE:-https://github.com/containernetworking/plugins/releases/download}"
|
||||
|
|
@ -526,11 +516,6 @@ export WINDOWS_ENABLE_PIGZ="${WINDOWS_ENABLE_PIGZ:-true}"
|
|||
# Enable Windows DSR (Direct Server Return)
|
||||
export WINDOWS_ENABLE_DSR="${WINDOWS_ENABLE_DSR:-false}"
|
||||
|
||||
# Install Node Problem Detector (NPD) on Windows nodes.
|
||||
# NPD analyzes the host for problems that can disrupt workloads.
|
||||
export WINDOWS_ENABLE_NODE_PROBLEM_DETECTOR="${WINDOWS_ENABLE_NODE_PROBLEM_DETECTOR:-none}"
|
||||
export WINDOWS_NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS="${WINDOWS_NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS:-}"
|
||||
|
||||
# Enable Windows Hyper-V
|
||||
# sig-storage uses it to create Virtual Hard Disks in tests
|
||||
export WINDOWS_ENABLE_HYPERV="${WINDOWS_ENABLE_HYPERV:-false}"
|
||||
|
|
|
|||
|
|
@ -331,16 +331,6 @@ export DNS_MEMORY_LIMIT=${KUBE_DNS_MEMORY_LIMIT:-170Mi}
|
|||
# Optional: Enable DNS horizontal autoscaler
|
||||
export ENABLE_DNS_HORIZONTAL_AUTOSCALER=${KUBE_ENABLE_DNS_HORIZONTAL_AUTOSCALER:-true}
|
||||
|
||||
# Optional: Install node problem detector.
|
||||
# none - Not run node problem detector.
|
||||
# daemonset - Run node problem detector as daemonset.
|
||||
# standalone - Run node problem detector as standalone system daemon.
|
||||
export ENABLE_NODE_PROBLEM_DETECTOR=${KUBE_ENABLE_NODE_PROBLEM_DETECTOR:-daemonset}
|
||||
NODE_PROBLEM_DETECTOR_VERSION=${NODE_PROBLEM_DETECTOR_VERSION:-}
|
||||
NODE_PROBLEM_DETECTOR_TAR_HASH=${NODE_PROBLEM_DETECTOR_TAR_HASH:-}
|
||||
NODE_PROBLEM_DETECTOR_RELEASE_PATH=${NODE_PROBLEM_DETECTOR_RELEASE_PATH:-}
|
||||
NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS=${NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS:-}
|
||||
|
||||
CNI_HASH=${CNI_HASH:-}
|
||||
CNI_TAR_PREFIX=${CNI_TAR_PREFIX:-cni-plugins-linux-amd64-}
|
||||
CNI_STORAGE_URL_BASE=${CNI_STORAGE_URL_BASE:-https://github.com/containernetworking/plugins/releases/download}
|
||||
|
|
@ -578,11 +568,6 @@ export WINDOWS_ENABLE_PIGZ="${WINDOWS_ENABLE_PIGZ:-true}"
|
|||
# Enable Windows DSR (Direct Server Return)
|
||||
export WINDOWS_ENABLE_DSR="${WINDOWS_ENABLE_DSR:-false}"
|
||||
|
||||
# Install Node Problem Detector (NPD) on Windows nodes.
|
||||
# NPD analyzes the host for problems that can disrupt workloads.
|
||||
export WINDOWS_ENABLE_NODE_PROBLEM_DETECTOR="${WINDOWS_ENABLE_NODE_PROBLEM_DETECTOR:-none}"
|
||||
export WINDOWS_NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS="${WINDOWS_NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS:-}"
|
||||
|
||||
# TLS_CIPHER_SUITES defines cipher suites allowed to be used by kube-apiserver.
|
||||
# If this variable is unset or empty, kube-apiserver will allow its default set of cipher suites.
|
||||
export TLS_CIPHER_SUITES=""
|
||||
|
|
|
|||
|
|
@ -73,13 +73,11 @@ func TestCreateMasterAuditPolicy(t *testing.T) {
|
|||
scheduler = newUserInfo(user.KubeScheduler, user.AllAuthenticated)
|
||||
apiserver = newUserInfo(user.APIServerUser, user.SystemPrivilegedGroup)
|
||||
autoscaler = newUserInfo("cluster-autoscaler", user.AllAuthenticated)
|
||||
npd = newUserInfo("system:node-problem-detector", user.AllAuthenticated)
|
||||
npdSA = serviceaccount.UserInfo("kube-system", "node-problem-detector", "")
|
||||
namespaceController = serviceaccount.UserInfo("kube-system", "namespace-controller", "")
|
||||
endpointController = serviceaccount.UserInfo("kube-system", "endpoint-controller", "")
|
||||
defaultSA = serviceaccount.UserInfo("default", "default", "")
|
||||
|
||||
allUsers = []user.Info{anonymous, kubeproxy, ingress, kubelet, node, controller, scheduler, apiserver, autoscaler, npd, npdSA, namespaceController, endpointController, defaultSA}
|
||||
allUsers = []user.Info{anonymous, kubeproxy, ingress, kubelet, node, controller, scheduler, apiserver, autoscaler, namespaceController, endpointController, defaultSA}
|
||||
)
|
||||
|
||||
// Resources for test cases
|
||||
|
|
@ -149,15 +147,15 @@ func TestCreateMasterAuditPolicy(t *testing.T) {
|
|||
|
||||
at.testResources(none, node, apiserver, defaultSA, anonymous, "get", "list", "create", "patch", "update", "delete", events)
|
||||
|
||||
at.testResources(request, kubelet, node, npd, npdSA, "update", "patch", nodeStatus, podStatus)
|
||||
at.testResources(request, kubelet, node, "update", "patch", nodeStatus, podStatus)
|
||||
|
||||
at.testResources(request, namespaceController, "deletecollection", pods, namespaces)
|
||||
|
||||
at.testResources(metadata, defaultSA, anonymous, npd, namespaceController, "get", "create", "update", secrets, configmaps, sysConfigmaps, tokenReviews)
|
||||
at.testResources(request, defaultSA, anonymous, npd, namespaceController, "get", "list", "watch", sysEndpoints, podMetrics, pods, clusterRoles, deployments)
|
||||
at.testResources(response, defaultSA, anonymous, npd, namespaceController, "create", "update", "patch", "delete", sysEndpoints, podMetrics, pods, clusterRoles, deployments)
|
||||
at.testResources(metadata, defaultSA, anonymous, namespaceController, "get", "create", "update", secrets, configmaps, sysConfigmaps, tokenReviews)
|
||||
at.testResources(request, defaultSA, anonymous, namespaceController, "get", "list", "watch", sysEndpoints, podMetrics, pods, clusterRoles, deployments)
|
||||
at.testResources(response, defaultSA, anonymous, namespaceController, "create", "update", "patch", "delete", sysEndpoints, podMetrics, pods, clusterRoles, deployments)
|
||||
|
||||
at.testResources(metadata, defaultSA, anonymous, npd, namespaceController, "get", "list", "watch", "create", "update", "patch", "delete", foobars, foobarbaz)
|
||||
at.testResources(metadata, defaultSA, anonymous, namespaceController, "get", "list", "watch", "create", "update", "patch", "delete", foobars, foobarbaz)
|
||||
}
|
||||
|
||||
type auditTester struct {
|
||||
|
|
|
|||
|
|
@ -845,9 +845,6 @@ function create-master-auth {
|
|||
if [[ -n "${KUBE_PROXY_TOKEN:-}" ]]; then
|
||||
append_or_replace_prefixed_line "${known_tokens_csv}" "${KUBE_PROXY_TOKEN}," "system:kube-proxy,uid:kube_proxy"
|
||||
fi
|
||||
if [[ -n "${NODE_PROBLEM_DETECTOR_TOKEN:-}" ]]; then
|
||||
append_or_replace_prefixed_line "${known_tokens_csv}" "${NODE_PROBLEM_DETECTOR_TOKEN}," "system:node-problem-detector,uid:node-problem-detector"
|
||||
fi
|
||||
if [[ -n "${GCE_GLBC_TOKEN:-}" ]]; then
|
||||
append_or_replace_prefixed_line "${known_tokens_csv}" "${GCE_GLBC_TOKEN}," "system:controller:glbc,uid:system:controller:glbc"
|
||||
fi
|
||||
|
|
@ -1239,7 +1236,7 @@ rules:
|
|||
|
||||
# node and pod status calls from nodes are high-volume and can be large, don't log responses for expected updates from nodes
|
||||
- level: Request
|
||||
users: ["kubelet", "system:node-problem-detector", "system:serviceaccount:kube-system:node-problem-detector"]
|
||||
users: ["kubelet"]
|
||||
verbs: ["update","patch"]
|
||||
resources:
|
||||
- group: "" # core
|
||||
|
|
@ -1438,41 +1435,6 @@ ${SCHEDULER_POLICY_CONFIG}
|
|||
EOF
|
||||
}
|
||||
|
||||
function create-node-problem-detector-kubeconfig {
|
||||
local apiserver_address="${1}"
|
||||
if [[ -z "${apiserver_address}" ]]; then
|
||||
echo "Must provide API server address to create node-problem-detector kubeconfig file!"
|
||||
exit 1
|
||||
fi
|
||||
echo "Creating node-problem-detector kubeconfig file"
|
||||
mkdir -p /var/lib/node-problem-detector
|
||||
cat <<EOF >/var/lib/node-problem-detector/kubeconfig
|
||||
apiVersion: v1
|
||||
kind: Config
|
||||
users:
|
||||
- name: node-problem-detector
|
||||
user:
|
||||
token: ${NODE_PROBLEM_DETECTOR_TOKEN}
|
||||
clusters:
|
||||
- name: local
|
||||
cluster:
|
||||
server: https://${apiserver_address}
|
||||
certificate-authority-data: ${CA_CERT}
|
||||
contexts:
|
||||
- context:
|
||||
cluster: local
|
||||
user: node-problem-detector
|
||||
name: service-account-context
|
||||
current-context: service-account-context
|
||||
EOF
|
||||
}
|
||||
|
||||
function create-node-problem-detector-kubeconfig-from-kubelet {
|
||||
echo "Creating node-problem-detector kubeconfig from /var/lib/kubelet/kubeconfig"
|
||||
mkdir -p /var/lib/node-problem-detector
|
||||
cp /var/lib/kubelet/kubeconfig /var/lib/node-problem-detector/kubeconfig
|
||||
}
|
||||
|
||||
function create-master-etcd-auth {
|
||||
if [[ -n "${ETCD_CA_CERT:-}" && -n "${ETCD_PEER_KEY:-}" && -n "${ETCD_PEER_CERT:-}" ]]; then
|
||||
local -r auth_dir="/etc/srv/kubernetes"
|
||||
|
|
@ -1688,56 +1650,6 @@ EOF
|
|||
systemctl start kubelet.service
|
||||
}
|
||||
|
||||
# This function assembles the node problem detector systemd service file and
|
||||
# starts it using systemctl.
|
||||
function start-node-problem-detector {
|
||||
echo "Start node problem detector"
|
||||
local -r npd_bin="${KUBE_HOME}/bin/node-problem-detector"
|
||||
echo "Using node problem detector binary at ${npd_bin}"
|
||||
|
||||
local flags="${NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS:-}"
|
||||
if [[ -z "${flags}" ]]; then
|
||||
local -r km_config="${KUBE_HOME}/node-problem-detector/config/kernel-monitor.json"
|
||||
# TODO(random-liu): Handle this for alternative container runtime.
|
||||
local -r dm_config="${KUBE_HOME}/node-problem-detector/config/docker-monitor.json"
|
||||
local -r sm_config="${KUBE_HOME}/node-problem-detector/config/systemd-monitor.json"
|
||||
local -r ssm_config="${KUBE_HOME}/node-problem-detector/config/system-stats-monitor.json"
|
||||
|
||||
local -r custom_km_config="${KUBE_HOME}/node-problem-detector/config/kernel-monitor-counter.json"
|
||||
local -r custom_sm_config="${KUBE_HOME}/node-problem-detector/config/systemd-monitor-counter.json"
|
||||
|
||||
flags="${NPD_TEST_LOG_LEVEL:-"--v=2"} ${NPD_TEST_ARGS:-}"
|
||||
flags+=" --logtostderr"
|
||||
flags+=" --config.system-log-monitor=${km_config},${dm_config},${sm_config}"
|
||||
flags+=" --config.system-stats-monitor=${ssm_config}"
|
||||
flags+=" --config.custom-plugin-monitor=${custom_km_config},${custom_sm_config}"
|
||||
local -r npd_port=${NODE_PROBLEM_DETECTOR_PORT:-20256}
|
||||
flags+=" --port=${npd_port}"
|
||||
if [[ -n "${EXTRA_NPD_ARGS:-}" ]]; then
|
||||
flags+=" ${EXTRA_NPD_ARGS}"
|
||||
fi
|
||||
fi
|
||||
flags+=" --apiserver-override=https://${KUBERNETES_MASTER_NAME}?inClusterConfig=false&auth=/var/lib/node-problem-detector/kubeconfig"
|
||||
|
||||
# Write the systemd service file for node problem detector.
|
||||
cat <<EOF >/etc/systemd/system/node-problem-detector.service
|
||||
[Unit]
|
||||
Description=Kubernetes node problem detector
|
||||
Requires=network-online.target
|
||||
After=network-online.target
|
||||
|
||||
[Service]
|
||||
Restart=always
|
||||
RestartSec=10
|
||||
ExecStart=${npd_bin} ${flags}
|
||||
|
||||
[Install]
|
||||
WantedBy=multi-user.target
|
||||
EOF
|
||||
|
||||
systemctl start node-problem-detector.service
|
||||
}
|
||||
|
||||
# Create the log file and set its properties.
|
||||
#
|
||||
# $1 is the file to create.
|
||||
|
|
@ -2966,16 +2878,6 @@ function start-kube-addons {
|
|||
update-event-exporter ${event_exporter_yaml}
|
||||
update-prometheus-to-sd-parameters ${event_exporter_yaml}
|
||||
fi
|
||||
if [[ "${ENABLE_NODE_PROBLEM_DETECTOR:-}" == "daemonset" ]]; then
|
||||
setup-addon-manifests "addons" "node-problem-detector"
|
||||
fi
|
||||
if [[ "${ENABLE_NODE_PROBLEM_DETECTOR:-}" == "standalone" ]]; then
|
||||
# Setup role binding(s) for standalone node problem detector.
|
||||
if [[ -n "${NODE_PROBLEM_DETECTOR_TOKEN:-}" ]]; then
|
||||
setup-addon-manifests "addons" "node-problem-detector/standalone"
|
||||
fi
|
||||
setup-addon-manifests "addons" "node-problem-detector/kubelet-user-standalone" "node-problem-detector"
|
||||
fi
|
||||
if echo "${ADMISSION_CONTROL:-}" | grep -q "LimitRanger"; then
|
||||
setup-addon-manifests "admission-controls" "limit-range" "gce"
|
||||
fi
|
||||
|
|
@ -3615,16 +3517,6 @@ function main() {
|
|||
log-wrap 'CreateNodePKI' create-node-pki
|
||||
log-wrap 'CreateKubeletKubeconfig' create-kubelet-kubeconfig "${KUBERNETES_MASTER_NAME}"
|
||||
log-wrap 'CreateKubeproxyUserKubeconfig' create-kubeproxy-user-kubeconfig
|
||||
if [[ "${ENABLE_NODE_PROBLEM_DETECTOR:-}" == "standalone" ]]; then
|
||||
if [[ -n "${NODE_PROBLEM_DETECTOR_TOKEN:-}" ]]; then
|
||||
log-wrap 'CreateNodeProblemDetectorKubeconfig' create-node-problem-detector-kubeconfig "${KUBERNETES_MASTER_NAME}"
|
||||
elif [[ -f "/var/lib/kubelet/kubeconfig" ]]; then
|
||||
log-wrap 'CreateNodeProblemDetectorKubeconfigFromKubelet' create-node-problem-detector-kubeconfig-from-kubelet
|
||||
else
|
||||
echo "Either NODE_PROBLEM_DETECTOR_TOKEN or /var/lib/kubelet/kubeconfig must be set"
|
||||
exit 1
|
||||
fi
|
||||
fi
|
||||
fi
|
||||
|
||||
log-wrap 'DetectCgroupConfig' detect-cgroup-config
|
||||
|
|
@ -3679,9 +3571,6 @@ function main() {
|
|||
log-wrap 'UpdateLegacyAddonNodeLabels' update-legacy-addon-node-labels &
|
||||
else
|
||||
log-wrap 'StartKubeProxy' start-kube-proxy
|
||||
if [[ "${ENABLE_NODE_PROBLEM_DETECTOR:-}" == "standalone" ]]; then
|
||||
log-wrap 'StartNodeProblemDetector' start-node-problem-detector
|
||||
fi
|
||||
fi
|
||||
log-wrap 'ResetMotd' reset-motd
|
||||
|
||||
|
|
|
|||
|
|
@ -27,9 +27,7 @@ set -o pipefail
|
|||
DEFAULT_CNI_VERSION='v1.9.1'
|
||||
# CNI HASH for amd64 sha512
|
||||
DEFAULT_CNI_HASH='3ea8a76852b7ddc62c087a34cccca2cb29822ca24214928cd172b28bf9d1486000ba3eb71a156445af31ff6a92c1dc3e01e702546c6ee016ef13fae06ccfb8fc'
|
||||
DEFAULT_NPD_VERSION='v1.34.0'
|
||||
DEFAULT_NPD_HASH_AMD64='3c55ff6ffadd77dbc3df3774d13164587103ca87c8b6914f5c71c87d8f498b78621e0c96538bb3c69f8f1b4194a6da553aa56b1b52001a7d9a67776ac24e80bd'
|
||||
DEFAULT_NPD_HASH_ARM64='ca1d34e64b80f6b2bdf86cfde95154122d6e14c707a748ea6fc414a55f391b1bb572a96b6b2c285996af0232917fa87e14e037125aa03a62247383af3e48c095'
|
||||
|
||||
DEFAULT_CRICTL_VERSION='v1.36.0'
|
||||
DEFAULT_CRICTL_AMD64_SHA512='43ac5425d264547bc9d9c9e31c74624d9c2a63bf7de4e77fe79517e0c927ea77ee3951a2f662920bc771599a0dc4f2859b6225c3621c7cafff952e63c83d686d'
|
||||
DEFAULT_CRICTL_ARM64_SHA512='485aa86f327c23cb0508e814e568bda793d291865c5cec3337ae5467a51898e9ab21a6bd38b73a6b219058bb34c9b4e7128e57360a2552b74a552e7ea1936f32'
|
||||
|
|
@ -293,56 +291,6 @@ function install-gci-mounter-tools {
|
|||
mkdir -p "${CONTAINERIZED_MOUNTER_HOME}/rootfs/var/lib/kubelet"
|
||||
}
|
||||
|
||||
# Install node problem detector binary.
|
||||
function install-node-problem-detector {
|
||||
if [[ -n "${NODE_PROBLEM_DETECTOR_VERSION:-}" ]]; then
|
||||
local -r npd_version="${NODE_PROBLEM_DETECTOR_VERSION}"
|
||||
local -r npd_hash="${NODE_PROBLEM_DETECTOR_TAR_HASH}"
|
||||
else
|
||||
local -r npd_version="${DEFAULT_NPD_VERSION}"
|
||||
case "${HOST_PLATFORM}/${HOST_ARCH}" in
|
||||
linux/amd64)
|
||||
local -r npd_hash="${DEFAULT_NPD_HASH_AMD64}"
|
||||
;;
|
||||
linux/arm64)
|
||||
local -r npd_hash="${DEFAULT_NPD_HASH_ARM64}"
|
||||
;;
|
||||
# no other architectures are supported currently.
|
||||
# Assumption is that this script only runs on linux,
|
||||
# see cluster/gce/windows/k8s-node-setup.psm1 for windows
|
||||
# https://github.com/kubernetes/node-problem-detector/releases/
|
||||
*)
|
||||
echo "Unrecognized version and platform/arch combination:"
|
||||
echo "$DEFAULT_NPD_VERSION $HOST_PLATFORM/$HOST_ARCH"
|
||||
echo "Set NODE_PROBLEM_DETECTOR_VERSION and NODE_PROBLEM_DETECTOR_TAR_HASH to overwrite"
|
||||
exit 1
|
||||
;;
|
||||
esac
|
||||
fi
|
||||
local -r npd_tar="node-problem-detector-${npd_version}-${HOST_PLATFORM}_${HOST_ARCH}.tar.gz"
|
||||
|
||||
if is-preloaded "${npd_tar}" "${npd_hash}"; then
|
||||
echo "${npd_tar} is preloaded."
|
||||
return
|
||||
fi
|
||||
|
||||
if [[ -n "${NODE_PROBLEM_DETECTOR_RELEASE_PATH:-}" ]]; then
|
||||
echo "Downloading ${npd_tar} from ${NODE_PROBLEM_DETECTOR_RELEASE_PATH}."
|
||||
local -r download_path="${NODE_PROBLEM_DETECTOR_RELEASE_PATH}/node-problem-detector/${npd_tar}"
|
||||
else
|
||||
echo "Downloading ${npd_tar} from github."
|
||||
local -r download_path="https://github.com/kubernetes/node-problem-detector/releases/download/${npd_version}/${npd_tar}"
|
||||
fi
|
||||
download-or-bust "${npd_hash}" "${download_path}"
|
||||
local -r npd_dir="${KUBE_HOME}/node-problem-detector"
|
||||
mkdir -p "${npd_dir}"
|
||||
tar xzf "${KUBE_HOME}/${npd_tar}" -C "${npd_dir}" --overwrite
|
||||
mv "${npd_dir}/bin"/* "${KUBE_BIN}"
|
||||
chmod a+x "${KUBE_BIN}/node-problem-detector"
|
||||
rmdir "${npd_dir}/bin"
|
||||
rm -f "${KUBE_HOME}/${npd_tar}"
|
||||
}
|
||||
|
||||
function install-cni-binaries {
|
||||
local -r cni_version=${CNI_VERSION:-$DEFAULT_CNI_VERSION}
|
||||
if [[ -n "${CNI_VERSION:-}" ]]; then
|
||||
|
|
@ -795,11 +743,6 @@ function install-kube-binary-config {
|
|||
mv "${KUBE_HOME}/kubernetes/kubernetes-src.tar.gz" "${KUBE_HOME}"
|
||||
fi
|
||||
|
||||
if [[ "${KUBERNETES_MASTER:-}" == "false" ]] && \
|
||||
[[ "${ENABLE_NODE_PROBLEM_DETECTOR:-}" == "standalone" ]]; then
|
||||
log-wrap "InstallNodeProblemDetector" install-node-problem-detector
|
||||
fi
|
||||
|
||||
if [[ "${NETWORK_PROVIDER:-}" == "kubenet" ]] || \
|
||||
[[ "${NETWORK_PROVIDER:-}" == "cni" ]]; then
|
||||
log-wrap "InstallCNIBinaries" install-cni-binaries
|
||||
|
|
|
|||
|
|
@ -114,11 +114,6 @@ function upgrade-master() {
|
|||
|
||||
function upgrade-master-env() {
|
||||
echo "== Upgrading master environment variables. =="
|
||||
# Generate the node problem detector token if it isn't present on the original
|
||||
# master.
|
||||
if [[ "${ENABLE_NODE_PROBLEM_DETECTOR:-}" == "standalone" && "${NODE_PROBLEM_DETECTOR_TOKEN:-}" == "" ]]; then
|
||||
NODE_PROBLEM_DETECTOR_TOKEN=$(dd if=/dev/urandom bs=128 count=1 2>/dev/null | base64 | tr -d "=+/" | dd bs=32 count=1 2>/dev/null)
|
||||
fi
|
||||
}
|
||||
|
||||
function wait-for-master() {
|
||||
|
|
@ -193,7 +188,6 @@ function get-node-os() {
|
|||
#
|
||||
# Vars set:
|
||||
# KUBE_PROXY_TOKEN
|
||||
# NODE_PROBLEM_DETECTOR_TOKEN
|
||||
# CA_CERT_BASE64
|
||||
# EXTRA_DOCKER_OPTS
|
||||
# KUBELET_CERT_BASE64
|
||||
|
|
@ -232,7 +226,6 @@ function setup-base-image() {
|
|||
# SANITIZED_VERSION
|
||||
# INSTANCE_GROUPS
|
||||
# KUBE_PROXY_TOKEN
|
||||
# NODE_PROBLEM_DETECTOR_TOKEN
|
||||
# CA_CERT_BASE64
|
||||
# EXTRA_DOCKER_OPTS
|
||||
# KUBELET_CERT_BASE64
|
||||
|
|
@ -257,8 +250,6 @@ function prepare-node-upgrade() {
|
|||
node_env=$(get-node-env)
|
||||
KUBE_PROXY_TOKEN=$(get-env-val "${node_env}" "KUBE_PROXY_TOKEN")
|
||||
export KUBE_PROXY_TOKEN
|
||||
NODE_PROBLEM_DETECTOR_TOKEN=$(get-env-val "${node_env}" "NODE_PROBLEM_DETECTOR_TOKEN")
|
||||
export NODE_PROBLEM_DETECTOR_TOKEN
|
||||
CA_CERT_BASE64=$(get-env-val "${node_env}" "CA_CERT")
|
||||
export CA_CERT_BASE64
|
||||
EXTRA_DOCKER_OPTS=$(get-env-val "${node_env}" "EXTRA_DOCKER_OPTS")
|
||||
|
|
@ -286,14 +277,6 @@ function prepare-node-upgrade() {
|
|||
|
||||
function upgrade-node-env() {
|
||||
echo "== Upgrading node environment variables. =="
|
||||
# Get the node problem detector token from master if it isn't present on
|
||||
# the original node.
|
||||
if [[ "${ENABLE_NODE_PROBLEM_DETECTOR:-}" == "standalone" && "${NODE_PROBLEM_DETECTOR_TOKEN:-}" == "" ]]; then
|
||||
detect-master
|
||||
local master_env
|
||||
master_env=$(get-master-env)
|
||||
NODE_PROBLEM_DETECTOR_TOKEN=$(get-env-val "${master_env}" "NODE_PROBLEM_DETECTOR_TOKEN")
|
||||
fi
|
||||
}
|
||||
|
||||
# Upgrades a single node.
|
||||
|
|
|
|||
|
|
@ -1128,11 +1128,6 @@ DOCKER_REGISTRY_MIRROR_URL: $(yaml-quote "${DOCKER_REGISTRY_MIRROR_URL:-}")
|
|||
ENABLE_L7_LOADBALANCING: $(yaml-quote "${ENABLE_L7_LOADBALANCING:-none}")
|
||||
ENABLE_CLUSTER_LOGGING: $(yaml-quote "${ENABLE_CLUSTER_LOGGING:-false}")
|
||||
ENABLE_AUTH_PROVIDER_GCP: $(yaml-quote "${ENABLE_AUTH_PROVIDER_GCP:-true}")
|
||||
ENABLE_NODE_PROBLEM_DETECTOR: $(yaml-quote "${ENABLE_NODE_PROBLEM_DETECTOR:-none}")
|
||||
NODE_PROBLEM_DETECTOR_VERSION: $(yaml-quote "${NODE_PROBLEM_DETECTOR_VERSION:-}")
|
||||
NODE_PROBLEM_DETECTOR_TAR_HASH: $(yaml-quote "${NODE_PROBLEM_DETECTOR_TAR_HASH:-}")
|
||||
NODE_PROBLEM_DETECTOR_RELEASE_PATH: $(yaml-quote "${NODE_PROBLEM_DETECTOR_RELEASE_PATH:-}")
|
||||
NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS: $(yaml-quote "${NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS:-}")
|
||||
CNI_STORAGE_URL_BASE: $(yaml-quote "${CNI_STORAGE_URL_BASE:-}")
|
||||
CNI_TAR_PREFIX: $(yaml-quote "${CNI_TAR_PREFIX:-}")
|
||||
CNI_VERSION: $(yaml-quote "${CNI_VERSION:-}")
|
||||
|
|
@ -1151,7 +1146,6 @@ ENABLE_DNS_HORIZONTAL_AUTOSCALER: $(yaml-quote "${ENABLE_DNS_HORIZONTAL_AUTOSCAL
|
|||
KUBE_PROXY_TOKEN: $(yaml-quote "${KUBE_PROXY_TOKEN:-}")
|
||||
KUBE_PROXY_MODE: $(yaml-quote "${KUBE_PROXY_MODE:-iptables}")
|
||||
DETECT_LOCAL_MODE: $(yaml-quote "${DETECT_LOCAL_MODE:-}")
|
||||
NODE_PROBLEM_DETECTOR_TOKEN: $(yaml-quote "${NODE_PROBLEM_DETECTOR_TOKEN:-}")
|
||||
ADMISSION_CONTROL: $(yaml-quote "${ADMISSION_CONTROL:-}")
|
||||
MASTER_IP_RANGE: $(yaml-quote "${MASTER_IP_RANGE}")
|
||||
RUNTIME_CONFIG: $(yaml-quote "${RUNTIME_CONFIG}")
|
||||
|
|
@ -1615,13 +1609,6 @@ WINDOWS_INFRA_CONTAINER: $(yaml-quote "${WINDOWS_INFRA_CONTAINER}")
|
|||
WINDOWS_ENABLE_PIGZ: $(yaml-quote "${WINDOWS_ENABLE_PIGZ}")
|
||||
WINDOWS_ENABLE_HYPERV: $(yaml-quote "${WINDOWS_ENABLE_HYPERV}")
|
||||
ENABLE_AUTH_PROVIDER_GCP: $(yaml-quote "${ENABLE_AUTH_PROVIDER_GCP}")
|
||||
ENABLE_NODE_PROBLEM_DETECTOR: $(yaml-quote "${WINDOWS_ENABLE_NODE_PROBLEM_DETECTOR}")
|
||||
NODE_PROBLEM_DETECTOR_VERSION: $(yaml-quote "${NODE_PROBLEM_DETECTOR_VERSION}")
|
||||
NODE_PROBLEM_DETECTOR_TAR_HASH: $(yaml-quote "${NODE_PROBLEM_DETECTOR_TAR_HASH}")
|
||||
NODE_PROBLEM_DETECTOR_RELEASE_PATH: $(yaml-quote "${NODE_PROBLEM_DETECTOR_RELEASE_PATH}")
|
||||
NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS: $(yaml-quote "${WINDOWS_NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS}")
|
||||
NODE_PROBLEM_DETECTOR_TOKEN: $(yaml-quote "${NODE_PROBLEM_DETECTOR_TOKEN:-}")
|
||||
WINDOWS_NODEPROBLEMDETECTOR_KUBECONFIG_FILE: $(yaml-quote "${WINDOWS_NODEPROBLEMDETECTOR_KUBECONFIG_FILE}")
|
||||
AUTH_PROVIDER_GCP_STORAGE_PATH: $(yaml-quote "${AUTH_PROVIDER_GCP_STORAGE_PATH}")
|
||||
AUTH_PROVIDER_GCP_VERSION: $(yaml-quote "${AUTH_PROVIDER_GCP_VERSION}")
|
||||
AUTH_PROVIDER_GCP_HASH_WINDOWS_AMD64: $(yaml-quote "${AUTH_PROVIDER_GCP_HASH_WINDOWS_AMD64}")
|
||||
|
|
@ -2080,7 +2067,6 @@ function parse-master-env() {
|
|||
local master_env
|
||||
master_env=$(get-master-env)
|
||||
KUBE_PROXY_TOKEN=$(get-env-val "${master_env}" "KUBE_PROXY_TOKEN")
|
||||
NODE_PROBLEM_DETECTOR_TOKEN=$(get-env-val "${master_env}" "NODE_PROBLEM_DETECTOR_TOKEN")
|
||||
CA_CERT_BASE64=$(get-env-val "${master_env}" "CA_CERT")
|
||||
CA_KEY_BASE64=$(get-env-val "${master_env}" "CA_KEY")
|
||||
KUBEAPISERVER_CERT_BASE64=$(get-env-val "${master_env}" "KUBEAPISERVER_CERT")
|
||||
|
|
@ -2903,9 +2889,6 @@ function create-master() {
|
|||
# computer) can forget it later. This should disappear with
|
||||
# http://issue.k8s.io/3168
|
||||
KUBE_PROXY_TOKEN=$(dd if=/dev/urandom bs=128 count=1 2>/dev/null | base64 | tr -d "=+/" | dd bs=32 count=1 2>/dev/null)
|
||||
if [[ "${ENABLE_NODE_PROBLEM_DETECTOR:-}" == "standalone" ]]; then
|
||||
NODE_PROBLEM_DETECTOR_TOKEN=$(dd if=/dev/urandom bs=128 count=1 2>/dev/null | base64 | tr -d "=+/" | dd bs=32 count=1 2>/dev/null)
|
||||
fi
|
||||
|
||||
# Reserve the master's IP so that it can later be transferred to another VM
|
||||
# without disrupting the kubelets.
|
||||
|
|
|
|||
|
|
@ -163,14 +163,12 @@ try {
|
|||
Configure-Crictl
|
||||
Setup-ContainerRuntime
|
||||
DownloadAndInstall-KubernetesBinaries
|
||||
DownloadAndInstall-NodeProblemDetector
|
||||
DownloadAndInstall-CSIProxyBinaries
|
||||
DownloadAndInstall-AuthProviderGcpBinary
|
||||
Start-CSIProxy
|
||||
Create-NodePki
|
||||
Create-KubeletKubeconfig
|
||||
Create-KubeproxyKubeconfig
|
||||
Create-NodeProblemDetectorKubeConfig
|
||||
Create-AuthProviderGcpConfig
|
||||
Set-PodCidr
|
||||
Configure-HostNetworkingService
|
||||
|
|
@ -178,7 +176,6 @@ try {
|
|||
Configure-HostDnsConf
|
||||
Configure-GcePdTools
|
||||
Configure-Kubelet
|
||||
Configure-NodeProblemDetector
|
||||
|
||||
# Even if Logging agent is already installed, the function will still [re]start the service.
|
||||
if (IsLoggingEnabled $kube_env) {
|
||||
|
|
|
|||
|
|
@ -295,8 +295,6 @@ function Set-EnvironmentVars {
|
|||
"INFRA_CONTAINER" = ${kube_env}['WINDOWS_INFRA_CONTAINER']
|
||||
"WINDOWS_ENABLE_PIGZ" = ${kube_env}['WINDOWS_ENABLE_PIGZ']
|
||||
"WINDOWS_ENABLE_HYPERV" = ${kube_env}['WINDOWS_ENABLE_HYPERV']
|
||||
"ENABLE_NODE_PROBLEM_DETECTOR" = ${kube_env}['ENABLE_NODE_PROBLEM_DETECTOR']
|
||||
"NODEPROBLEMDETECTOR_KUBECONFIG_FILE" = ${kube_env}['WINDOWS_NODEPROBLEMDETECTOR_KUBECONFIG_FILE']
|
||||
"ENABLE_AUTH_PROVIDER_GCP" = ${kube_env}['ENABLE_AUTH_PROVIDER_GCP']
|
||||
"AUTH_PROVIDER_GCP_STORAGE_PATH" = ${kube_env}['AUTH_PROVIDER_GCP_STORAGE_PATH']
|
||||
"AUTH_PROVIDER_GCP_VERSION" = ${kube_env}['AUTH_PROVIDER_GCP_VERSION']
|
||||
|
|
@ -1484,140 +1482,6 @@ function Install-Pigz {
|
|||
}
|
||||
}
|
||||
|
||||
# Node Problem Detector Resources
|
||||
# Name of the Windows service that Configure-NodeProblemDetector looks up,
# creates and starts.
$NPD_SERVICE = "node-problem-detector"
|
||||
# NPD version installed when ${kube_env} does not provide
# NODE_PROBLEM_DETECTOR_VERSION.
$DEFAULT_NPD_VERSION = '0.8.10-gke0.1'
|
||||
# Base URL for the NPD tarball; the download path appends
# "/node-problem-detector/<tarball name>". Overridable through the
# NODE_PROBLEM_DETECTOR_RELEASE_PATH key of ${kube_env}.
$DEFAULT_NPD_RELEASE_PATH = 'https://storage.googleapis.com/gke-release/winnode'
|
||||
# Hash passed to MustDownload-File (with -Algorithm SHA512) when the default
# version is used.
$DEFAULT_NPD_HASH = '97ddfe3544da9e02a1cfb55d24f329eb29d606fca7fbbf800415d5de9dbc29a00563f8e0d1919595c8e316fd989d45b09b13c07be528841fc5fd37e21d016a2d'
|
||||
|
||||
# Downloads the Node Problem Detector (NPD) tarball and installs its binaries
# into ${env:NODE_DIR}. Only acts when NPD is enabled in "standalone" mode.
# NPD analyzes the host for problems that can disrupt workloads.
# https://github.com/kubernetes/node-problem-detector
#
# Optional ${kube_env} keys (the $DEFAULT_NPD_* values apply otherwise):
#   NODE_PROBLEM_DETECTOR_VERSION (read together with
#     NODE_PROBLEM_DETECTOR_TAR_HASH)
#   NODE_PROBLEM_DETECTOR_RELEASE_PATH
function DownloadAndInstall-NodeProblemDetector {
  if ("${env:ENABLE_NODE_PROBLEM_DETECTOR}" -ne "standalone") {
    return
  }

  if (-not (ShouldWrite-File "${env:NODE_DIR}\node-problem-detector.exe")) {
    Log-Output "Node Problem Detector already installed."
    return
  }

  # The version and its hash are overridden as a pair: a custom version always
  # takes the hash from ${kube_env}, never the default one.
  if ([string]::IsNullOrEmpty(${kube_env}['NODE_PROBLEM_DETECTOR_VERSION'])) {
    $version = $DEFAULT_NPD_VERSION
    $tar_hash = $DEFAULT_NPD_HASH
  }
  else {
    $version = ${kube_env}['NODE_PROBLEM_DETECTOR_VERSION']
    $tar_hash = ${kube_env}['NODE_PROBLEM_DETECTOR_TAR_HASH']
  }

  if ([string]::IsNullOrEmpty(${kube_env}['NODE_PROBLEM_DETECTOR_RELEASE_PATH'])) {
    $release_path = $DEFAULT_NPD_RELEASE_PATH
  }
  else {
    $release_path = ${kube_env}['NODE_PROBLEM_DETECTOR_RELEASE_PATH']
  }

  $archive = "node-problem-detector-v${version}-windows_amd64.tar.gz"

  Log-Output "Downloading ${archive}."

  $install_dir = "${env:K8S_DIR}\node-problem-detector"
  New-Item -Path $install_dir -ItemType Directory -Force -Confirm:$false

  $download_args = @{
    URLs      = "${release_path}/node-problem-detector/${archive}"
    Hash      = $tar_hash
    Algorithm = 'SHA512'
    OutFile   = "${install_dir}\${archive}"
  }
  MustDownload-File @download_args

  # Unpack in place, move the binaries next to the other node binaries, then
  # drop the emptied bin directory and the tarball.
  tar xzvf "${install_dir}\${archive}" -C $install_dir
  Move-Item "${install_dir}\bin\*" "${env:NODE_DIR}\" -Force -Confirm:$false
  Remove-Item "${install_dir}\bin" -Force -Confirm:$false
  Remove-Item "${install_dir}\${archive}" -Force -Confirm:$false
}
|
||||
|
||||
# Creates the node-problem-detector user kubeconfig file at
# $env:NODEPROBLEMDETECTOR_KUBECONFIG_FILE (if defined).
# Only acts when NPD is enabled in "standalone" mode.
#
# If ${kube_env} carries a NODE_PROBLEM_DETECTOR_TOKEN, the kubeconfig is
# generated from that token. Otherwise the file at ${env:BOOTSTRAP_KUBECONFIG}
# is copied (and its contents logged). If neither is available, the script
# exits with status 1.
#
# Create-NodePki() must be called first.
#
# Required ${kube_env} keys:
#   CA_CERT
#   NODE_PROBLEM_DETECTOR_TOKEN
function Create-NodeProblemDetectorKubeConfig {
  if ("${env:ENABLE_NODE_PROBLEM_DETECTOR}" -eq "standalone") {
    # The closing brace must end the variable name *before* the indexer:
    # ${kube_env}['KEY'] indexes the hashtable, whereas ${kube_env]['KEY']}
    # names a different (non-existent, always-null) variable and would make
    # this branch unreachable.
    if (-not [string]::IsNullOrEmpty(${kube_env}['NODE_PROBLEM_DETECTOR_TOKEN'])) {
      Log-Output "Create-NodeProblemDetectorKubeConfig using Node Problem Detector token"
      Create-Kubeconfig -Name 'node-problem-detector' `
        -Path ${env:NODEPROBLEMDETECTOR_KUBECONFIG_FILE} `
        -Token ${kube_env}['NODE_PROBLEM_DETECTOR_TOKEN']
    } elseif (Test-Path ${env:BOOTSTRAP_KUBECONFIG}) {
      # No dedicated token: fall back to reusing the bootstrap kubeconfig.
      Log-Output "Create-NodeProblemDetectorKubeConfig creating kubeconfig from kubelet kubeconfig"
      Copy-Item ${env:BOOTSTRAP_KUBECONFIG} -Destination ${env:NODEPROBLEMDETECTOR_KUBECONFIG_FILE}
      Log-Output ("node-problem-detector bootstrap kubeconfig:`n" +
          "$(Get-Content -Raw ${env:NODEPROBLEMDETECTOR_KUBECONFIG_FILE})")
    } else {
      Log-Output "Either NODE_PROBLEM_DETECTOR_TOKEN or ${env:BOOTSTRAP_KUBECONFIG} must be set"
      exit 1
    }
  }
}
|
||||
|
||||
# Registers and starts NPD as a Windows service that reports against the
# Kubernetes api server. Only acts when NPD is enabled in "standalone" mode
# and the NPD binary is present in ${env:NODE_DIR}; an already existing
# service is left untouched.
#
# Unless ${kube_env} provides NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS, the service
# runs with the bundled monitor configs. The --apiserver-override flag is
# appended in either case.
function Configure-NodeProblemDetector {
  $npd_bin = "${env:NODE_DIR}\node-problem-detector.exe"
  if ("${env:ENABLE_NODE_PROBLEM_DETECTOR}" -ne "standalone" -or -not (Test-Path $npd_bin)) {
    return
  }

  $existing_svc = Get-Service -Name $NPD_SERVICE -ErrorAction SilentlyContinue
  if (-not ($existing_svc -eq $null)) {
    Log-Output "${NPD_SERVICE} already configured."
    return
  }

  $config_root = "${env:K8S_DIR}\node-problem-detector"
  $logs_root = "${env:LOGS_DIR}\node-problem-detector"

  New-Item -Path $logs_root -Type Directory -Force -Confirm:$false

  $custom_flags = ${kube_env}['NODE_PROBLEM_DETECTOR_CUSTOM_FLAGS']
  if (-not [string]::IsNullOrEmpty($custom_flags)) {
    $flags = $custom_flags
  }
  else {
    # Bundled monitor configs, grouped by the NPD flag that consumes them.
    # The containerd entries are the NPD configuration for the CRI monitor.
    $log_monitor_configs = @(
      "${config_root}\config\windows-containerd-monitor-filelog.json"
    )
    $stats_monitor_configs = @(
      "${config_root}\config\windows-system-stats-monitor.json"
    )
    $plugin_monitor_configs = @(
      "${config_root}\config\windows-health-checker-kubelet.json",
      "${config_root}\config\windows-health-checker-kubeproxy.json",
      "${config_root}\config\windows-defender-monitor.json",
      "${config_root}\config\windows-health-checker-containerd.json"
    )

    $monitor_flags = @(
      @{ Format = " --config.system-log-monitor={0}"; Configs = $log_monitor_configs },
      @{ Format = " --config.system-stats-monitor={0}"; Configs = $stats_monitor_configs },
      @{ Format = " --config.custom-plugin-monitor={0}"; Configs = $plugin_monitor_configs }
    )

    $flags = "--v=2 --port=20256 --log_dir=${logs_root}"
    foreach ($entry in $monitor_flags) {
      # A monitor flag is only emitted when it has at least one config file.
      if ($entry.Configs.Count -gt 0) {
        $flags += $entry.Format -f ($entry.Configs -join ",")
      }
    }
  }

  $master_name = ${kube_env}['KUBERNETES_MASTER_NAME']
  $flags = "${flags} --apiserver-override=`"https://${master_name}?inClusterConfig=false&auth=${env:NODEPROBLEMDETECTOR_KUBECONFIG_FILE}`""

  Log-Output "Creating service: ${NPD_SERVICE}"
  Log-Output "${npd_bin} ${flags}"
  sc.exe create $NPD_SERVICE binpath= "${npd_bin} ${flags}" displayName= "Node Problem Detector"
  sc.exe failure $NPD_SERVICE reset= 30 actions= restart/5000
  sc.exe start $NPD_SERVICE

  Write-VerboseServiceInfoToConsole -Service $NPD_SERVICE
}
|
||||
|
||||
# TODO(pjh): move the logging agent code below into a separate
|
||||
# module; it was put here temporarily to avoid disrupting the file layout in
|
||||
# the K8s release machinery.
|
||||
|
|
@ -1872,21 +1736,6 @@ $FLUENTBIT_CONFIG = @'
|
|||
Parser docker
|
||||
Parser containerd
|
||||
|
||||
# Log line format: [IWEF]mmdd hh:mm:ss.uuuuuu threadid file:line] msg
|
||||
# Example:
|
||||
# I0716 02:08:55.559351 3356 log_spam.go:42] Command line arguments:
|
||||
[INPUT]
|
||||
Name tail
|
||||
Alias node-problem-detector
|
||||
Tag node-problem-detector
|
||||
Mem_Buf_Limit 5MB
|
||||
Skip_Long_Lines On
|
||||
Refresh_Interval 5
|
||||
Path C:\etc\kubernetes\logs\node-problem-detector\*.log.INFO*
|
||||
DB /var/run/google-fluentbit/pos-files/node-problem-detector.db
|
||||
Multiline On
|
||||
Parser_Firstline glog
|
||||
|
||||
# Example:
|
||||
# I0928 03:15:50.440223 4880 main.go:51] Starting CSI-Proxy Server ...
|
||||
[INPUT]
|
||||
|
|
|
|||
|
|
@ -42,8 +42,8 @@ readonly node_ssh_supported_providers="gce gke aws"
|
|||
readonly gcloud_supported_providers="gce gke"
|
||||
|
||||
readonly master_logfiles="kube-apiserver.log kube-apiserver-audit.log kube-scheduler.log kube-controller-manager.log cloud-controller-manager.log etcd.log etcd-events.log glbc.log cluster-autoscaler.log kube-addon-manager.log konnectivity-server.log fluentd.log kubelet.cov"
|
||||
readonly node_logfiles="kube-proxy.log containers/konnectivity-agent-*.log fluentd.log node-problem-detector.log kubelet.cov kube-network-policies.log"
|
||||
readonly node_systemd_services="node-problem-detector"
|
||||
readonly node_logfiles="kube-proxy.log containers/konnectivity-agent-*.log fluentd.log kubelet.cov kube-network-policies.log"
|
||||
readonly node_systemd_services=""
|
||||
readonly hollow_node_logfiles="kubelet-hollow-node-*.log kubeproxy-hollow-node-*.log npd-hollow-node-*.log"
|
||||
readonly aws_logfiles="cloud-init-output.log"
|
||||
readonly gce_logfiles="startupscript.log"
|
||||
|
|
|
|||
|
|
@ -92,7 +92,7 @@ func updateImageAllowList(ctx context.Context) {
|
|||
}
|
||||
|
||||
func getNodeProblemDetectorImage() string {
|
||||
const defaultImage string = "registry.k8s.io/node-problem-detector/node-problem-detector:v1.34.0"
|
||||
const defaultImage string = "registry.k8s.io/node-problem-detector/node-problem-detector:v1.35.2"
|
||||
image := os.Getenv("NODE_PROBLEM_DETECTOR_IMAGE")
|
||||
if image == "" {
|
||||
image = defaultImage
|
||||
|
|
|
|||
|
|
@ -99,7 +99,7 @@ spec:
|
|||
cpu: {{hollow_proxy_millicpu}}m
|
||||
memory: {{hollow_proxy_mem_Ki}}Ki
|
||||
- name: hollow-node-problem-detector
|
||||
image: registry.k8s.io/node-problem-detector/node-problem-detector:v1.34.0
|
||||
image: registry.k8s.io/node-problem-detector/node-problem-detector:v1.35.2
|
||||
env:
|
||||
- name: NODE_NAME
|
||||
valueFrom:
|
||||
|
|
|
|||
Loading…
Reference in a new issue