Skip to content

Commit

Permalink
[FLINK-12123][tests] Upgrade Jepsen to 0.1.13
Browse files Browse the repository at this point in the history
Upgrade Jepsen dependency to get support for Debian Stretch (Jessie has reached
EOL):
  * Adapt for new checker interface
  * Install Marathon manually because there are no packages for Debian Stretch
  * Upgrade Marathon to 1.7 because I cannot find the 1.6.322 binaries anywhere
  * Update Dockerfiles to Debian Stretch

This closes apache#8131.
  • Loading branch information
GJL committed Apr 10, 2019
1 parent bdb30a7 commit c3bd1bd
Show file tree
Hide file tree
Showing 9 changed files with 105 additions and 58 deletions.
6 changes: 2 additions & 4 deletions flink-jepsen/docker/Dockerfile-control
Original file line number Diff line number Diff line change
Expand Up @@ -16,11 +16,9 @@
# limitations under the License.
################################################################################

FROM debian:jessie
FROM debian:stretch

RUN echo "deb http:https://http.debian.net/debian jessie-backports main" >> /etc/apt/sources.list && \
apt-get update && \
apt-get install -y -t jessie-backports openjdk-8-jdk && \
RUN apt-get update && \
apt-get install -qqy \
less \
libjna-java \
Expand Down
32 changes: 27 additions & 5 deletions flink-jepsen/docker/Dockerfile-db
Original file line number Diff line number Diff line change
Expand Up @@ -16,12 +16,34 @@
# limitations under the License.
################################################################################

FROM debian:jessie
FROM debian:stretch

RUN echo "deb http:https://http.debian.net/debian jessie-backports main" >> /etc/apt/sources.list && \
apt-get update && \
apt-get install -y -t jessie-backports openjdk-8-jdk && \
apt-get install -y apt-utils bzip2 curl faketime iproute iptables iputils-ping less libzip2 logrotate man man-db net-tools ntpdate psmisc python rsyslog runit sudo sysvinit sysvinit-core sysvinit-utils tar unzip vim wget
RUN apt-get update && \
apt-get install -qqy \
apt-utils \
bzip2 \
curl \
faketime \
gnupg \
iproute \
iptables \
iputils-ping \
less \
libzip4 \
logrotate \
man \
man-db \
net-tools \
ntpdate \
openjdk-8-jdk \
psmisc python \
rsyslog \
runit \
sudo \
tar \
unzip \
vim \
wget

RUN apt-get update && \
apt-get -y install openssh-server && \
Expand Down
2 changes: 1 addition & 1 deletion flink-jepsen/project.clj
Original file line number Diff line number Diff line change
Expand Up @@ -22,7 +22,7 @@
:dependencies [[org.clojure/clojure "1.9.0"],
[cheshire "5.8.0"]
[clj-http "3.8.0"]
[jepsen "0.1.11"],
[jepsen "0.1.13"],
[jepsen.zookeeper "0.1.0"]
[org.clojure/data.xml "0.0.8"]
[zookeeper-clj "0.9.4" :exclusions [org.slf4j/slf4j-log4j12]]]
Expand Down
43 changes: 23 additions & 20 deletions flink-jepsen/src/jepsen/flink/checker.clj
Original file line number Diff line number Diff line change
Expand Up @@ -149,11 +149,9 @@
; ignore other client operations
this)))))

(defn job-running-within-grace-period
(defn- job-running-within-grace-period
([job-running-healthy-threshold job-recovery-grace-period job-cancellation-grace-period]
(JobRunningWithinGracePeriod. {} 0 nil job-running-healthy-threshold job-recovery-grace-period job-cancellation-grace-period false nil))
([job-running-healthy-threshold job-recovery-grace-period]
(job-running-within-grace-period job-running-healthy-threshold job-recovery-grace-period 10)))
(JobRunningWithinGracePeriod. {} 0 nil job-running-healthy-threshold job-recovery-grace-period job-cancellation-grace-period false nil)))

(defn- history->jobs-running?-value
[history]
Expand Down Expand Up @@ -216,19 +214,24 @@
history)))

(defn job-running-checker
[]
(reify
checker/Checker
(check [_ test model history _]
(let [job-ids (history->job-ids history)
individual-job-histories (map (partial history->single-job-history history) job-ids)
final-models (map (partial compute-final-model model) individual-job-histories)
inconsistent-or-unhealthy (or (empty? job-ids)
(some model/inconsistent? final-models)
(some (complement healthy?) final-models))
result-map (select-keys test [:nemesis-gen :deployment-mode])]
(if inconsistent-or-unhealthy
(into result-map {:valid? false
:final-models final-models})
(into result-map {:valid? true
:final-models final-models}))))))
([job-running-healthy-threshold job-recovery-grace-period]
(job-running-checker job-running-healthy-threshold job-recovery-grace-period 10))
([job-running-healthy-threshold job-recovery-grace-period job-cancellation-grace-period]
(reify
checker/Checker
(check [_ test history _]
(let [job-ids (history->job-ids history)
individual-job-histories (map (partial history->single-job-history history) job-ids)
model (job-running-within-grace-period job-running-healthy-threshold
job-recovery-grace-period
job-cancellation-grace-period)
final-models (map (partial compute-final-model model) individual-job-histories)
inconsistent-or-unhealthy (or (empty? job-ids)
(some model/inconsistent? final-models)
(some (complement healthy?) final-models))
result-map (select-keys test [:nemesis-gen :deployment-mode])]
(if inconsistent-or-unhealthy
(into result-map {:valid? false
:final-models final-models})
(into result-map {:valid? true
:final-models final-models})))))))
14 changes: 6 additions & 8 deletions flink-jepsen/src/jepsen/flink/flink.clj
Original file line number Diff line number Diff line change
Expand Up @@ -36,9 +36,9 @@
(def default-flink-dist-url "https://archive.apache.org/dist/flink/flink-1.6.0/flink-1.6.0-bin-hadoop28-scala_2.11.tgz")
(def hadoop-dist-url "https://archive.apache.org/dist/hadoop/common/hadoop-2.8.3/hadoop-2.8.3.tar.gz")
;; URL of the Kafka distribution archive that is handed to kafka/db in the dbs map.
;; The scheme must be a single, well-formed "http://" prefix so the archive can be downloaded.
(def kafka-dist-url "http://mirror.funkfreundelandshut.de/apache/kafka/2.0.1/kafka_2.11-2.0.1.tgz")
(def deb-zookeeper-package "3.4.9-3+deb8u1")
(def deb-mesos-package "1.5.0-2.0.2")
(def deb-marathon-package "1.6.322")
(def deb-zookeeper-package "3.4.9-3+deb9u1")
(def deb-mesos-package "1.5.0-2.0.1")
(def marathon-dist-url "https://downloads.mesosphere.io/marathon/builds/1.7.189-48bfd6000/marathon-1.7.189-48bfd6000.tgz")

(def dbs
{:flink-yarn-job (fdb/yarn-job-db)
Expand All @@ -47,7 +47,7 @@
:flink-mesos-session (fdb/flink-mesos-app-master)
:hadoop (hadoop/db hadoop-dist-url)
:kafka (kafka/db kafka-dist-url)
:mesos (mesos/db deb-mesos-package deb-marathon-package)
:mesos (mesos/db deb-mesos-package marathon-dist-url)
:zookeeper (zk/db deb-zookeeper-package)})

(def poll-jobs-running {:type :invoke, :f :jobs-running?, :value nil})
Expand Down Expand Up @@ -84,9 +84,6 @@
:os debian/os
:db (fdb/combined-db dbs)
:nemesis (fn/nemesis)
:model (flink-checker/job-running-within-grace-period
job-running-healthy-threshold
job-recovery-grace-period)
:generator (let [stop (atom nil)]
(->> (fg/stoppable-generator stop (client-gen))
(gen/nemesis
Expand All @@ -95,7 +92,8 @@
job-running-healthy-threshold
job-recovery-grace-period))))
:client (create-client)
:checker (flink-checker/job-running-checker)})
:checker (flink-checker/job-running-checker job-running-healthy-threshold
job-recovery-grace-period)})
(assoc opts :concurrency 1)))

(defn- keys->allowed-values-help-text
Expand Down
35 changes: 24 additions & 11 deletions flink-jepsen/src/jepsen/flink/mesos.clj
Original file line number Diff line number Diff line change
Expand Up @@ -37,7 +37,8 @@

;;; Marathon

(def marathon-bin "/usr/bin/marathon")
(def marathon-install-dir "/opt/marathon")
(def marathon-bin (str marathon-install-dir "/bin/marathon"))
(def zk-marathon-namespace "marathon")
(def marathon-rest-port 8080)

Expand Down Expand Up @@ -69,6 +70,7 @@
(str "--master=" (zookeeper-uri test zk-namespace))
(str "--recovery_timeout=30secs")
(str "--work_dir=" slave-dir)
(str "--no-systemd_enable_support")
(str "--resources='cpus:8'")]))

(defn create-mesos-master-supervised-service!
Expand Down Expand Up @@ -118,21 +120,24 @@
(c/lit (str log-dir "/*"))
(c/lit (str slave-dir "/*"))))))

;;; Marathon functions

(defn install!
[test node mesos-version marathon-version]
(defn install-mesos!
[mesos-version]
(c/su
(debian/add-repo! :mesosphere
"deb http:https://repos.mesosphere.com/debian jessie main"
"deb http:https://repos.mesosphere.com/debian stretch main"
"keyserver.ubuntu.com"
"E56151BF")
(debian/install {:mesos mesos-version
:marathon marathon-version})
(c/exec :mkdir :-p "/var/run/mesos")
(debian/install {:mesos mesos-version})
(c/exec :mkdir :-p master-dir)
(c/exec :mkdir :-p slave-dir)))

;;; Marathon functions

;; Installs Marathon by passing the archive URL `marathon-dist-url` and the target
;; directory marathon-install-dir to cu/install-archive!.
;; The call is wrapped in c/su (presumably "run as superuser" from jepsen.control -- confirm).
(defn install-marathon!
[marathon-dist-url]
(c/su
(cu/install-archive! marathon-dist-url marathon-install-dir)))

(defn marathon-cmd
"Returns the command to run the marathon."
[test node]
Expand Down Expand Up @@ -168,11 +173,19 @@
[test]
(str "http:https://" (first (sort (:nodes test))) ":" marathon-rest-port))

;;; Mesos & Marathon DB

;; Installs Mesos and then Marathon by delegating to install-mesos! and install-marathon!.
;;   mesos-version     -- forwarded unchanged to install-mesos!
;;   marathon-dist-url -- forwarded unchanged to install-marathon!
;; NOTE(review): both helpers already wrap their own work in c/su, so the outer c/su
;; here looks redundant -- confirm before removing.
(defn install!
[mesos-version marathon-dist-url]
(c/su
(install-mesos! mesos-version)
(install-marathon! marathon-dist-url)))

(defn db
[mesos-version marathon-version]
[mesos-version marathon-dist-url]
(reify db/DB
(setup! [this test node]
(install! test node mesos-version marathon-version)
(install! mesos-version marathon-dist-url)
(start-master! test node)
(start-slave! test node)
(start-marathon! test node))
Expand Down
17 changes: 14 additions & 3 deletions flink-jepsen/src/jepsen/flink/nemesis.clj
Original file line number Diff line number Diff line change
Expand Up @@ -25,21 +25,32 @@
[jepsen.flink.client :refer :all]
[jepsen.flink.generator :as fgen]
[jepsen.flink.hadoop :as fh]
[jepsen.flink.zookeeper :refer :all]))
[jepsen.flink.zookeeper :refer :all]
[slingshot.slingshot :refer [try+]]))

(def job-submit-grace-period
"Period after job submission in which job managers must not fail."
60)

;; Private wrapper around cu/grepkill! for `pattern`.
;; The call runs inside slingshot's try+; a thrown map matching
;; {:type :jepsen.control/nonzero-exit, :exit 123} is swallowed (the catch clause has no body).
;; Anything that does not match this selector is not caught here and propagates to the caller.
(defn- grepkill!
[pattern]
(try+
(cu/grepkill! pattern)
;; HACK:
;; On Debian Stretch, Jepsen's grepkill! throws an exception if the pattern does not match any
;; processes. We are swallowing the exception here because the process we are attempting to kill
;; might not be (re-)started yet.
;; For details, see https://github.com/jepsen-io/jepsen/issues/366
(catch [:type :jepsen.control/nonzero-exit :exit 123] _)))

(defn kill-processes
([pattern] (kill-processes rand-nth pattern))
([targeter pattern]
(reify nemesis/Nemesis
(setup! [this test] this)
(invoke! [this test op]
(let [nodes (-> test :nodes targeter ju/coll)]
(c/on-many nodes
(c/su (cu/grepkill! pattern)))
(c/on-many nodes (c/su (grepkill! pattern)))
(assoc op :value nodes)))
(teardown! [this test]))))

Expand Down
9 changes: 6 additions & 3 deletions flink-jepsen/src/jepsen/flink/utils.clj
Original file line number Diff line number Diff line change
Expand Up @@ -71,12 +71,14 @@

;;; runit process supervisor (http://smarden.org/runit/)

(def runit-version "2.1.2-3")
(def runit-version "2.1.2-9.2")
(def runit-systemd-version "2.1.2-9.2")

(defn- install-process-supervisor!
"Installs the process supervisor."
[]
(debian/install {:runit runit-version}))
(debian/install {:runit runit-version
:runit-systemd runit-systemd-version}))

(defn create-supervised-service!
"Registers a service with the process supervisor and starts it."
Expand All @@ -91,6 +93,7 @@
"exec 2>&1"
(str "exec " cmd)]) :> run-script)
(c/exec :chmod :+x run-script)
(c/exec :mkdir :-p "/etc/service")
(c/exec :ln :-sfT service-dir (str "/etc/service/" service-name)))))

(defn stop-supervised-service!
Expand All @@ -108,4 +111,4 @@
;; HACK:
;; Remove all symlinks in /etc/service except sshd.
;; This is only relevant when tests are run in Docker because there sshd is started using runit.
(meh (c/exec :find (c/lit (str "/etc/service -maxdepth 1 -type l ! -name 'sshd' -delete"))))))
(meh (c/exec :find (c/lit (str "/etc/service -mindepth 1 -maxdepth 1 -type l -not -name 'sshd' -delete"))))))
5 changes: 2 additions & 3 deletions flink-jepsen/test/jepsen/flink/checker_test.clj
Original file line number Diff line number Diff line change
Expand Up @@ -30,11 +30,10 @@
(is (= [false true false] (all-jobs-running?-history history)))))

(deftest job-running-checker-test
(let [checker (job-running-checker)
(let [checker (job-running-checker 3 60 10)
test {}
model (job-running-within-grace-period 3 60 10)
opts {}
check (fn [history] (checker/check checker test model history opts))
check (fn [history] (checker/check checker test history opts))
job-running-value {"3886d6b547969c3f15c53896bb496b8f" true}
job-not-running-value {"3886d6b547969c3f15c53896bb496b8f" false}]
(testing "Model should be inconsistent if job is not running after grace period."
Expand Down

0 comments on commit c3bd1bd

Please sign in to comment.