Added Post Incident Practices module

* Open sourced Post Incident Practices module * Open sourced Post Incident Practices module
dxc-technology · Dec 16, 2020 · 06f583c · 06f583c
1 parent 1cbe2ac
commit 06f583c
Show file tree

Hide file tree

Showing 39 changed files with 702 additions and 1 deletion.
diff --git a/CHANGELOG.md b/CHANGELOG.md
@@ -5,6 +5,12 @@ All notable changes to this project will be documented in this file.
 The format is based on [Keep a Changelog](https://keepachangelog.com/en/1.0.0/),
 and this project adheres to [Semantic Versioning](https://semver.org/spec/v2.0.0.html).
 
+## [1.6.0] - 2020-12-15
+
+### Added
+
+- Post Incident Practices Module
+
 ## [1.5.1] - 2020-12-13
 
 ### Changed

diff --git a/assets/online-devops-dojo/post-incident-practices/adam.png b/assets/online-devops-dojo/post-incident-practices/adam.png
diff --git a/assets/online-devops-dojo/post-incident-practices/brenda.png b/assets/online-devops-dojo/post-incident-practices/brenda.png
diff --git a/assets/online-devops-dojo/post-incident-practices/charlie.png b/assets/online-devops-dojo/post-incident-practices/charlie.png
diff --git a/assets/online-devops-dojo/post-incident-practices/chun.png b/assets/online-devops-dojo/post-incident-practices/chun.png
diff --git a/assets/online-devops-dojo/post-incident-practices/dan.png b/assets/online-devops-dojo/post-incident-practices/dan.png
diff --git a/assets/online-devops-dojo/post-incident-practices/paulo.png b/assets/online-devops-dojo/post-incident-practices/paulo.png
diff --git a/assets/online-devops-dojo/post-incident-practices/santhosh.png b/assets/online-devops-dojo/post-incident-practices/santhosh.png
diff --git a/assets/online-devops-dojo/post-incident-practices/selma.png b/assets/online-devops-dojo/post-incident-practices/selma.png
diff --git a/assets/online-devops-dojo/post-incident-practices/team-chat.jpg b/assets/online-devops-dojo/post-incident-practices/team-chat.jpg
diff --git a/assets/online-devops-dojo/post-incident-practices/tina.png b/assets/online-devops-dojo/post-incident-practices/tina.png
diff --git a/online-devops-dojo-pathway.json b/online-devops-dojo-pathway.json
@@ -36,6 +36,11 @@
       "title": "DevOps Kaizen",
       "description": "DevOps Kaizen events can be used to help you and your team continuously improve your development processes.",
       "course_id": "devops-kaizen"
+    },
+    {
+      "title": "Post Incident Practices",
+      "description": "Learn how to establish a Safety Culture, conduct blameless post-mortems and create a “code of conduct” to help your team manage incidents in all phases of their life cycle.",
+      "course_id": "post-incident-practices"
     }
   ]
 }
diff --git a/online-devops-dojo/post-incident-practices/assets/dialog.py b/online-devops-dojo/post-incident-practices/assets/dialog.py
@@ -0,0 +1,156 @@
+#!/usr/bin/python3
+# Python helper to simulate a dialog
+# This Source Code Form is subject to the terms of the Mozilla Public
+# License, v. 2.0. If a copy of the MPL was not distributed with this
+# file, You can obtain one at https://mozilla.org/MPL/2.0/.
+
+import os, sys, time, yaml, random, curses, re, textwrap
+
+# Dialog script to play: path to a YAML file, given as the first command line argument
+dialog_file = sys.argv[1]
+
+# Module-level defaults read by the helper functions below
+firstname = ""
+height = 0
+width = 0
+
+# Init color pairs
+def setcolors():
+    """Register the 15 curses color pairs used by the dialog player.
+
+    Pair 1 (black on blue) is used for the speaker name; pairs 2 to 15 are
+    a foreground color number on a black background, pair 15 being the one
+    used for prompts.
+    """
+    black = curses.COLOR_BLACK
+    curses.init_pair(1, black, curses.COLOR_BLUE)  # Speaker name
+    # Foreground color numbers for pairs 2..15, in pair order
+    foregrounds = (183, 119, 230, 229, 230, 200, 195, 233,
+                   250, 251, 252, 253, 254, 255)
+    for pair_number, foreground in enumerate(foregrounds, start=2):
+        curses.init_pair(pair_number, foreground, black)
+
+# Each speaker speaks with a different color, near-white (pair 15) by default
+def color(speaker):
+    """Return the curses color-pair number used for *speaker*'s lines.
+
+    The lookup ignores case. A speaker that is not in the table gets pair 15,
+    the pair setcolors() initialises with foreground 255 on black.
+
+    The result is always an int so it can be passed to curses.color_pair();
+    the previous fallback was the string '2', which curses.color_pair()
+    rejects.
+    """
+    return {
+        'paulo': 2,
+        'santhosh': 3,
+        'adam': 4,
+        'dan': 5,
+        'tina': 6,
+        'selma': 7,
+        'chun': 8,
+        'hal': 9,
+        'brenda': 10,
+        'charlie': 11,
+        'team': 12,
+    }.get(speaker.lower(), 15)
+
+# Type text like a human
+def human_type(stdscr, speaking, firstname, text, color, dialog_id):
+    """Print one dialog line character by character, with typing pauses.
+
+    stdscr    -- curses window to write to
+    speaking  -- speaker name, printed as "<name>> " with color pair 1
+    firstname -- replaces every "[student]" placeholder in text
+    text      -- the line to type
+    color     -- color pair number used for the typed characters
+    dialog_id -- passed to wait_for_enter() when the screen is full
+
+    Pressing 'n' while the line is being typed removes the pauses for the
+    rest of that line.
+    """
+    text = text.replace("[student]", firstname)
+    needed_rows = text.count("\n") + 1
+    row, _ = stdscr.getyx()
+    max_rows, _ = stdscr.getmaxyx()
+    # Not enough room left on screen: pause, then start from a blank screen
+    if row + needed_rows + 2 > max_rows:
+        wait_for_enter(stdscr, dialog_id)
+        stdscr.clear()
+
+    # Speaker prefix
+    stdscr.addstr(speaking + ">", curses.color_pair(1))
+    stdscr.addstr(" ")
+
+    # getch() must not block while typing
+    stdscr.nodelay(1)
+    skip_pauses = False
+    for char in text:
+        stdscr.refresh()
+        try:
+            stdscr.addch(char, curses.color_pair(color))
+            # 'n' skips to the end of the sentence
+            if stdscr.getch() == ord('n'):
+                skip_pauses = True
+        except curses.error:
+            pass
+        if skip_pauses:
+            pause = 0
+        elif char == ' ':
+            pause = 0.07
+        elif char in '.?,':
+            pause = 0.15
+        else:
+            pause = random.choice([0.01, 0.02])
+        time.sleep(pause)
+    stdscr.nodelay(0)
+
+def redraw_on_resize(stdscr, dialog_id):
+    """Clear the window and reprint the dialog up to entry number dialog_id.
+
+    The dialog file is read again from disk and each entry is printed at
+    once (no typing effect), wrapped to the current window width, followed
+    by a "Press enter" prompt after the last replayed entry.
+
+    stdscr    -- curses window to redraw
+    dialog_id -- 1-based number of the last dialog entry to reprint
+    """
+    stdscr.clear()
+    stdscr.refresh()
+    # load_all() parses lazily, so the file has to stay open during the loop
+    with open(dialog_file, 'r') as stream:
+        entries = yaml.load_all(stream, Loader=yaml.FullLoader)
+        try:
+            for shown, entry in enumerate(entries, start=1):
+                _, width = stdscr.getmaxyx()
+                speaker = entry['speaking']
+                # textwrap rejects widths below 1 (very narrow terminal)
+                wrap_at = max(1, width - len(speaker) - 3)
+                stdscr.addstr(speaker + ">", curses.color_pair(1))
+                stdscr.addstr(" ")
+                stdscr.addstr(textwrap.fill(entry['text'].replace("[student]", firstname), wrap_at), curses.color_pair(color(speaker)))
+                stdscr.addstr("\n")
+                if shown >= dialog_id:
+                    stdscr.addstr("Press enter\n", curses.color_pair(15))
+                    break
+        except curses.error:
+            # The replayed text no longer fits in the (smaller) window:
+            # keep what could be drawn instead of crashing the player.
+            pass
+
+def wait_for_enter(stdscr, dialog_id):
+    """Show a "Press enter" prompt and block until Enter or 'n' is pressed.
+
+    The dialog is redrawn through redraw_on_resize() when
+    curses.is_term_resized() reports a change against the module-level
+    height and width, and again each time a resize event arrives while
+    waiting.
+
+    stdscr    -- curses window to read keys from
+    dialog_id -- passed on to redraw_on_resize()
+    """
+    try:
+        stdscr.addstr("\nPress enter\n", curses.color_pair(15))
+    except curses.error:
+        pass
+    # Resize before waiting for enter
+    if curses.is_term_resized(height, width):
+        redraw_on_resize(stdscr, dialog_id)
+    while True:
+        key = stdscr.getch()
+        if key == curses.KEY_RESIZE:
+            # Window resized while waiting for enter
+            redraw_on_resize(stdscr, dialog_id)
+        # 10 is Enter (line feed), 110 is 'n'
+        if key in (10, 110):
+            break
+
+
+def main(stdscr):
+    """Play every entry of the dialog file in the curses window stdscr.
+
+    Each entry is typed with human_type() and followed by a "Press enter"
+    pause. When the whole dialog has been played and the file name looks
+    like dialog<N>.yaml, the empty marker file /tmp/dialog<N>played.txt is
+    created.
+
+    Exits with an error message if the dialog file is not valid YAML.
+    """
+    # redraw_on_resize() and wait_for_enter() read these module-level names,
+    # so they must be assigned as globals rather than as locals of main().
+    global firstname, height, width
+
+    setcolors()
+    # Get first name of the user
+    try:
+        with open('/tmp/firstname.txt', 'r') as name_file:
+            firstname = name_file.read().replace('\n', '')
+    except IOError:
+        firstname = "the student"
+
+    # Open the dialog. load_all() parses lazily, so the documents are
+    # materialised here to surface YAML errors now and to close the file.
+    try:
+        with open(dialog_file, 'r') as stream:
+            dialogs = list(yaml.load_all(stream, Loader=yaml.FullLoader))
+    except yaml.YAMLError as exc:
+        # curses.wrapper() restores the terminal before this is reported
+        raise SystemExit("Cannot parse " + dialog_file + ": " + str(exc))
+
+    # Start the dialog
+    for dialog_id, dialog in enumerate(dialogs, start=1):
+        stdscr.refresh()
+        height, width = stdscr.getmaxyx()   # Capture size before dialog
+        speaker = dialog['speaking']
+        # Substitute the name before wrapping so line lengths are exact, and
+        # leave room for the "<speaker>> " prefix as redraw_on_resize() does.
+        text = dialog['text'].replace("[student]", firstname)
+        wrap_at = max(1, width - len(speaker) - 3)
+        try:
+            human_type(stdscr, speaker, firstname, textwrap.fill(text, wrap_at), color(speaker), dialog_id)
+        except curses.error:
+            pass
+        wait_for_enter(stdscr, dialog_id)
+
+        # Erase the "Press enter" prompt line before the next entry
+        y, x = stdscr.getyx()
+        stdscr.move(max(y - 1, 0), 0)
+        stdscr.clrtoeol()
+        stdscr.refresh()
+
+    # We went through the end of the dialog
+    played = re.search(r'dialog(\d*)\.yaml', dialog_file)
+    if played:
+        with open('/tmp/dialog' + played.group(1) + 'played.txt', 'w'):
+            pass
+
+if __name__ == '__main__':
+    curses.wrapper(main)
diff --git a/online-devops-dojo/post-incident-practices/assets/dialog1.sh b/online-devops-dojo/post-incident-practices/assets/dialog1.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+# Start from an empty terminal
+clear
+
+# Play the first dialog; TERM is set to a 256-color terminal type for this command only
+TERM=xterm-256color python3 dialog.py dialog1.yaml
+
+printf '%s\n' "Click on 'CONTINUE'."
diff --git a/online-devops-dojo/post-incident-practices/assets/dialog1.yaml b/online-devops-dojo/post-incident-practices/assets/dialog1.yaml
@@ -0,0 +1,72 @@
+---
+speaking: Chun
+text: >
+    Let me start by focusing on what was done well in relation to the incident post-mortem because it is important those practices continue.
+---
+speaking: Chun
+text: >
+    The post-mortem was scheduled immediately after the incident was resolved which is as per best practices.
+---
+speaking: Santhosh
+text: >
+    What else are we doing well?
+---
+speaking: Paulo
+text: >
+    From what we saw, you had all the right people in the meeting. Representatives from the teams who developed, tested, and deployed the application. 
+---
+speaking: Paulo
+text: >
+    The people who reported the outage, identified the problem, found the root cause of the issue, and the people who resolved the issue.
+---
+speaking: Chun
+text: >
+    You created a timeline of the outage and started to get people's perspectives on the events that led to the outage and the outage itself.
+---
+speaking: Santhosh
+text: >
+    Good to hear but I think we all know what you are going to say next - at that point it all went off the rails and it degenerated into a finger pointing exercise.
+---
+speaking: Paulo
+text: >
+    Exactly.
+---
+speaking: Chun
+text: >
+    High performance teams create a Safety Culture where people are empowered to act, expected to act, and rewarded for taking smart action.
+---
+speaking: Chun
+text: >
+    Where mistakes are not punished, so that people give open and frank accounts of their actions which may have contributed to the outage.
+---
+speaking: Santhosh
+text: >
+    And how does your suggestion for a Safety Culture apply to the post incident post-mortem?
+---
+speaking: Chun
+text: >
+    The post-mortem should be blameless and be focused on learning as much as possible from an event or outage. It should be used to instill a culture of action in the team.
+---
+speaking: Paulo
+text: >
+    OK, so how might the post-mortem we held yesterday have played out if it had been held as a blameless post-mortem in a Safety Culture?
+---
+speaking: Chun
+text: >
+    Interesting question, I will need some volunteers to participate in a little role playing exercise to explain.
+---
+speaking: Team
+text: >
+    Groans and starts looking at their feet.
+---
+speaking: Chun
+text: >
+    Laughs and tells everyone not to be shy - the exercise will be fun.
+---
+speaking: Team
+text: >
+    Of course it will.
+---
+speaking: Chun
+text: >
+    First, let's ask some questions to [student].
diff --git a/online-devops-dojo/post-incident-practices/assets/dialog2.sh b/online-devops-dojo/post-incident-practices/assets/dialog2.sh
@@ -0,0 +1,9 @@
+#!/bin/bash
+
+# Start from an empty terminal
+clear
+
+# Play the second dialog; TERM is set to a 256-color terminal type for this command only
+TERM=xterm-256color python3 dialog.py dialog2.yaml
+
+printf '%s\n' "Click on 'CONTINUE'."
diff --git a/online-devops-dojo/post-incident-practices/assets/dialog2.yaml b/online-devops-dojo/post-incident-practices/assets/dialog2.yaml
@@ -0,0 +1,112 @@
+---
+speaking: Paulo
+text: >
+    Thank you all for joining us today, I think we have everyone we need for this blameless post-mortem.
+---
+speaking: Santhosh
+text: >
+    We do, we have representatives from the teams who developed, tested, and deployed the application.
+---
+speaking: Santhosh
+text: >
+    We have the people who reported the outage, identified the problem, found the root cause of the issue, and the people who resolved the issue.
+---
+speaking: Paulo
+text: >
+    Great, and we have the timeline of events for the outage, when it was detected, reported, responded to, and resolved? 
+---
+speaking: Santhosh
+text: >
+    We do, so how do you want to proceed?
+---
+speaking: Paulo
+text: >
+    I say we dive right in, but first let me share a few ground rules, this is a blameless post-mortem so it will be focused on learning as much as possible from an event or outage. This lets us take appropriate remediation actions to prevent issues of this type happening again. Chun would you like to add something?
+---
+speaking: Chun
+text: >
+    Sure, the purpose of the exercise is to focus on the why rather than the who. We are not here to attribute blame, so I would ask people to be as open and transparent as possible.
+---
+speaking: Paulo
+text: >
+    Thanks Chun. Who wants to start us off?
+---
+speaking: Dan
+text: >
+    Adam, can we get the application logs for the weekend the incident occurred?
+---
+speaking: Adam
+text: >
+    I could not find the logs. The incident was a week ago and detailed logs must have been recycled.
+---
+speaking: Dan
+text: >
+    Can we keep the logs for longer?
+---
+speaking: Adam
+text: >
+    Yes we can, but we need you guys in the development team to make the log retention period configurable in the application.
+---
+speaking: Dan
+text: >
+    OK, we can do that.
+---
+speaking: Paulo
+text: >
+    How come the first we heard of this outage was from impacted customers?
+---
+speaking: Adam
+text: >
+    Unfortunately we did not have monitoring, or alarms configured on those servers.
+---
+speaking: Paulo
+text: >
+    That is not ideal. What can we do to resolve those issues?
+---
+speaking: Adam
+text: >
+    We can set some thresholds for the appropriate variables then configure monitoring and alerting to ensure we are informed when they are exceeded.
+---
+speaking: Paulo
+text: >
+    OK, let's agree to proceed on that basis.
+---
+speaking: Santhosh
+text: >
+    While the lack of logs, monitoring, and alarms did not help with the time to resolve the issue, there was still an underlying bug in the application. 
+---
+speaking: Santhosh
+text: >
+    Dan/Tina can we do something to test for the bugs of this nature prior to releasing and deploying the application?
+---
+speaking: Tina
+text: >
+    Sure, we can put some automated tests in place.
+---
+speaking: Dan
+text: >
+    We could augment the tests by peer code reviews to help ensure that a change of this nature does not cause issues again.
+---
+speaking: Santhosh
+text: >
+    Thanks guys, we have a plan.
+---
+speaking: Santhosh
+text: >
+    I will work with Paulo to create and prioritize the remediation stories in the backlog to cover what we just discussed.
+---
+speaking: Chun
+text: >
+    See how the tone and nature of the conversation changed? I know the conversation is a little contrived but hopefully you guys got the essential message.
+---
+speaking: Chun
+text: >
+    The priorities in a blameless post-mortem are the why and what can be done, in the system, to prevent the same issue from occurring.
+---
+speaking: Paulo
+text: >
+    Thanks Chun, it may take us a while to get there, but I think we all understand what you are saying.
+---
+speaking: Paulo
+text: >
+    What would help is if you could share some tips with the team on how to conduct a blameless post-mortem in terms of best practices including any metrics we should be focusing on?