Sfoglia il codice sorgente

HPCC-27048 Post-mortem debug ability in cloud

Initial design for post-mortem support

Signed-off-by: Richard Chapman <rchapman@hpccsystems.com>
Richard Chapman 3 anni fa
parent
commit
9402989e3c

+ 1 - 0
common/workunit/workunit.cpp

@@ -10148,6 +10148,7 @@ EnumMapping queryFileTypes[] = {
    { FileTypeHintXml, "hint" },
    { FileTypeXml, "xml" },
    { FileTypeLog, "log" },
+   { FileTypePostMortem, "postmortem" },
    { FileTypeSize,  NULL },
 };
 

+ 3 - 2
common/workunit/workunit.hpp

@@ -359,10 +359,11 @@ enum WUFileType
     FileTypeHintXml = 3,
     FileTypeXml = 4,
     FileTypeLog = 5,
-    FileTypeSize = 6
+    FileTypePostMortem = 6,
+    FileTypeSize = 7
 };
 
-
+extern WORKUNIT_API EnumMapping queryFileTypes[];
 
 
 interface IConstWUAssociatedFile : extends IInterface

+ 1 - 0
dockerfiles/platform-core-debug/Dockerfile

@@ -35,6 +35,7 @@ ENV PATH="/opt/HPCCSystems/bin:${PATH}"
 ENV HPCC_containerized=1
 
 USER hpcc
+RUN cp /opt/HPCCSystems/bin/.gdbinit ~/.gdbinit
 WORKDIR /var/lib/HPCCSystems
 ARG BUILD_TAG_OVERRIDE
 ENV HPCC_BUILD_TAG=${BUILD_TAG_OVERRIDE}

+ 2 - 0
dockerfiles/platform-core/Dockerfile

@@ -56,6 +56,7 @@ RUN if [ ${USE_CPPUNIT} -eq 1 ] ; then apt-get install -y libcppunit-1.15-0 ; fi
 
 RUN apt-get install -y \
     dnsutils \
+    gdb \
     nano 
 
 # Set the locale
@@ -80,6 +81,7 @@ RUN mkdir /var/lock/HPCCSystems && chown hpcc:hpcc /var/lock/HPCCSystems
 RUN mkdir /var/run/HPCCSystems && chown hpcc:hpcc /var/run/HPCCSystems
 
 USER hpcc
+RUN cp /opt/HPCCSystems/bin/.gdbinit ~/.gdbinit
 ENV PATH="/opt/HPCCSystems/bin:${PATH}"
 ENV HPCC_containerized=1
 WORKDIR /var/lib/HPCCSystems

+ 6 - 0
dockerfiles/startall.sh

@@ -129,6 +129,12 @@ while [ "$#" -gt 0 ]; do
   shift
 done
 
+# An "upgrade" of a release that is not currently installed is treated as an install.
+if [[ "${CMD}" = "upgrade" ]]; then
+  # helm's -f/--filter option is a regular expression, so anchor it: otherwise a
+  # release whose name merely contains ${CLUSTERNAME} would also be listed.
+  # Both sides are quoted so the comparison is a literal string match, not a glob.
+  if [[ "$(helm list -q -f "^${CLUSTERNAME}\$")" != "${CLUSTERNAME}" ]]; then
+    echo "Requested installation for upgrade does not exist - assuming install"
+    CMD="install"
+  fi
+fi
 [[ -n ${INPUT_DOCKER_REPO} ]] && DOCKER_REPO=${INPUT_DOCKER_REPO}
 [[ -z ${LABEL} ]] && LABEL=$(docker image ls | fgrep "${DOCKER_REPO}/platform-core" | head -n 1 | awk '{print $2}')
 

+ 0 - 10
esp/services/ws_workunits/ws_workunitsHelpers.cpp

@@ -504,16 +504,6 @@ unsigned WsWuInfo::getTimerCount()
     return visitor.getNumTimers();
 }
 
-EnumMapping queryFileTypes[] = {
-   { FileTypeCpp, "cpp" },
-   { FileTypeDll, "dll" },
-   { FileTypeResText, "res" },
-   { FileTypeHintXml, "hint" },
-   { FileTypeXml, "xml" },
-   { FileTypeLog, "log" },
-   { FileTypeSize,  NULL },
-};
-
 void WsWuInfo::getHelpers(IEspECLWorkunit &info, unsigned long flags)
 {
     try

+ 5 - 0
helm/examples/local/hpcc-localfile/values.yaml

@@ -21,6 +21,11 @@ planes:
   size: 1Gi
   rwmany: true
   category: sasha
+- name: debug
+  subPath: debug
+  size: 5Gi
+  rwmany: true
+  category: debug
 - name: data
   subPath: hpcc-data # cannot currently be changed
   size: 3Gi

+ 71 - 45
helm/hpcc/templates/_helpers.tpl

@@ -79,12 +79,14 @@ Pass in dict with root, category
 {{- $planes := ($storage.planes | default list) -}}
 {{- $firstPlane := dict -}}
 {{- range $plane := $planes -}}
+{{- if not $plane.disabled -}}
 {{- if not $firstPlane.plane -}}
 {{- if (eq $category $plane.category) -}}
 {{- $_ := set $firstPlane "plane" $plane.name -}}
 {{- end -}}
 {{- end -}}
 {{- end -}}
+{{- end -}}
 {{- if $firstPlane.plane -}}
 {{- $firstPlane.plane -}}
 {{- end -}}
@@ -135,6 +137,16 @@ Get default git plane
 {{- include "hpcc.getFirstPlaneForCategory" (dict "root" $ "category" "git") | default (include "hpcc.getFirstPlaneForCategory" (dict "root" $ "category" "dll")) -}}
 {{- end -}}
 
+{{/*
+Emit the HPCC_DEBUG_PATH environment variable entry for a component.
+Pass in dict with root, me
+The plane used is me.debugPlane if set, otherwise the first plane of category "debug".
+The value is whatever hpcc.getPlanePrefix returns for that plane.
+Nothing is emitted when no debug plane is found.
+*/}}
+{{- define "hpcc.printDebugEnvironment" -}}
+{{- $debugPlane := .me.debugPlane | default (include "hpcc.getFirstPlaneForCategory"  (dict "root" .root "category" "debug")) -}}
+{{- if $debugPlane -}}
+ {{- include "hpcc.checkPlaneExists" (dict "root" .root "planeName" $debugPlane) -}}
+ {{- $prefix := include "hpcc.getPlanePrefix" (dict "root" .root "planeName" $debugPlane) -}}
+- name: HPCC_DEBUG_PATH
+  value: {{ $prefix }}
+ {{- end -}}
+{{- end -}}
+
 {{/*
 Returns the largest number of workers from all the thors
 */}}
@@ -210,15 +222,17 @@ storage:
   planes:
 {{- /*Generate entries for each data plane (removing the pvc).  Exclude the planes used for dlls and dali.*/ -}}
 {{- range $plane := $planes }}
+ {{- if not $plane.disabled }}
   - name: {{ $plane.name | quote }}
- {{- $planeYaml := omit $plane "name" "pvc" "storageClass" "storageSize" "subPath" -}}
- {{- if $plane.subPath -}}
-  {{- $_ := set $planeYaml "prefix" (printf "%s/%s" $planeYaml.prefix $plane.subPath) -}}
- {{- end -}}
- {{- if and (eq "data" $plane.category) (not $plane.defaultSprayParts) -}}
-  {{- $_ := set $planeYaml "defaultSprayParts" (include "hpcc.getMaxNumWorkers" $ | int) -}}
- {{- end -}}
- {{- toYaml $planeYaml | nindent 4 }}
+  {{- $planeYaml := omit $plane "name" "pvc" "storageClass" "storageSize" "subPath" -}}
+  {{- if $plane.subPath -}}
+   {{- $_ := set $planeYaml "prefix" (printf "%s/%s" $planeYaml.prefix $plane.subPath) -}}
+  {{- end -}}
+  {{- if and (eq "data" $plane.category) (not $plane.defaultSprayParts) -}}
+   {{- $_ := set $planeYaml "defaultSprayParts" (include "hpcc.getMaxNumWorkers" $ | int) -}}
+  {{- end -}}
+  {{- toYaml $planeYaml | nindent 4 }}
+ {{- end }}
 {{- end }}
 {{- if not (include "hpcc.hasPlaneForCategory" (dict "root" $ "category" "spill")) }}
   - name: hpcc-spill-plane
@@ -306,22 +320,24 @@ to addVolumeMounts so that if a plane can be used for multiple purposes then dup
 {{- $includeNames := .includeNames | default list -}}
 {{- $previousMounts := dict -}}
 {{- range $plane := $planes -}}
- {{- if or ($plane.pvc) (hasKey $plane "storageClass") -}}
-  {{- if not (hasKey $previousMounts $plane.prefix) -}}
-   {{- $mountpath := $plane.prefix -}}
-   {{- if or (has $plane.category $includeCategories) (has $plane.name $includeNames) }}
-    {{- $num := int ( $plane.numDevices | default 1 ) -}}
-    {{- if le $num 1 }}
+ {{- if not $plane.disabled -}}
+  {{- if or ($plane.pvc) (hasKey $plane "storageClass") -}}
+   {{- if not (hasKey $previousMounts $plane.prefix) -}}
+    {{- $mountpath := $plane.prefix -}}
+    {{- if or (has $plane.category $includeCategories) (has $plane.name $includeNames) }}
+     {{- $num := int ( $plane.numDevices | default 1 ) -}}
+     {{- if le $num 1 }}
 - name: {{ lower $plane.name }}-pv
   mountPath: {{ $mountpath | quote }}
-    {{- else }}
-     {{- range $elem := untilStep 1 (int (add $num 1)) 1 }}
+     {{- else }}
+      {{- range $elem := untilStep 1 (int (add $num 1)) 1 }}
 - name: {{ lower $plane.name }}-pv-many-{{- $elem }}
   mountPath: {{ printf "%s/d%d" $mountpath $elem | quote }}
+      {{- end }}
      {{- end }}
     {{- end }}
+    {{- $_ := set $previousMounts $plane.prefix true -}}
    {{- end }}
-   {{- $_ := set $previousMounts $plane.prefix true -}}
   {{- end }}
  {{- end }}
 {{- end }}
@@ -340,25 +356,27 @@ The plane will generate a volume if it matches either an includeLabel or an incl
 {{- $includeNames := .includeNames | default list -}}
 {{- $previousMounts := dict -}}
 {{- range $plane := $planes -}}
- {{- if or ($plane.pvc) (hasKey $plane "storageClass") -}}
-  {{- if not (hasKey $previousMounts $plane.prefix) -}}
-   {{- $mountpath := $plane.prefix -}}
-   {{- if or (has $plane.category $includeCategories) (has $plane.name $includeNames) }}
-    {{- $pvc := hasKey $plane "pvc" | ternary $plane.pvc (printf "%s-%s-pvc" (include "hpcc.fullname" $) $plane.name) -}}
-    {{- $num := int ( $plane.numDevices | default 1 ) -}}
-    {{- if le $num 1 }}
+ {{- if not $plane.disabled -}}
+  {{- if or ($plane.pvc) (hasKey $plane "storageClass") -}}
+   {{- if not (hasKey $previousMounts $plane.prefix) -}}
+    {{- $mountpath := $plane.prefix -}}
+    {{- if or (has $plane.category $includeCategories) (has $plane.name $includeNames) }}
+     {{- $pvc := hasKey $plane "pvc" | ternary $plane.pvc (printf "%s-%s-pvc" (include "hpcc.fullname" $) $plane.name) -}}
+     {{- $num := int ( $plane.numDevices | default 1 ) -}}
+     {{- if le $num 1 }}
 - name: {{ lower $plane.name }}-pv
   persistentVolumeClaim:
     claimName: {{ $pvc }}
-    {{- else }}
-     {{- range $elem := until $num }}
+     {{- else }}
+      {{- range $elem := until $num }}
 - name: {{ lower $plane.name }}-pv-many-{{- add $elem 1 }}
   persistentVolumeClaim:
     claimName: {{ $pvc }}-{{- add $elem 1 }}
-     {{- end }}
-    {{- end -}}
+      {{- end }}
+     {{- end -}}
+    {{- end }}
+    {{- $_ := set $previousMounts $plane.prefix true -}}
    {{- end }}
-   {{- $_ := set $previousMounts $plane.prefix true -}}
   {{- end }}
  {{- end }}
 {{- end -}}
@@ -374,8 +392,10 @@ Pass in dict with root, planeName
 {{- $name := .planeName -}}
 {{- $matched := dict -}}
 {{- range $plane := $planes -}}
- {{- if (eq $plane.name $name) -}}
-  {{- $_ := set $matched "ok" true -}}
+ {{- if not $plane.disabled -}}
+  {{- if (eq $plane.name $name) -}}
+   {{- $_ := set $matched "ok" true -}}
+  {{- end -}}
  {{- end -}}
 {{- end -}}
 {{- if not $matched.ok -}}
@@ -514,11 +534,13 @@ Check whether a storage plane is defined or not.
 {{- $planes := ($storage.planes | default list) -}}
 {{- $done := dict -}}
 {{- range $plane := $planes -}}
- {{- if eq $category $plane.category -}}
-  {{- if eq $search $plane.name -}}
-   {{- $_ := set $done "matched" true -}}
+ {{- if not $plane.disabled -}}
+  {{- if eq $category $plane.category -}}
+   {{- if eq $search $plane.name -}}
+    {{- $_ := set $done "matched" true -}}
+   {{- end -}}
+   {{- $_ := set $done "all" ( printf "%s \"%s\"" $done.all $plane.name) -}}
   {{- end -}}
-  {{- $_ := set $done "all" ( printf "%s \"%s\"" $done.all $plane.name) -}}
  {{- end -}}
 {{- end -}}
 {{- if not $done.matched -}}
@@ -631,11 +653,13 @@ NB: uid=10000 and gid=10001 are the uid/gid of the hpcc user, built into platfor
 {{- $includeCategories := .includeCategories | default list -}}
 {{- $includeNames := .includeNames | default list -}}
 {{- range $plane := $planes -}}
- {{- if and ($plane.forcePermissions) (or ($plane.pvc) (hasKey $plane "storageClass")) -}}
-  {{- $mountpath := $plane.prefix -}}
-  {{- if or (has $plane.category $includeCategories) (has $plane.name $includeNames) -}}
-   {{- $volumeName := (printf "%s-pv" $plane.name) -}}
+ {{- if not $plane.disabled -}}
+  {{- if and ($plane.forcePermissions) (or ($plane.pvc) (hasKey $plane "storageClass")) -}}
+   {{- $mountpath := $plane.prefix -}}
+   {{- if or (has $plane.category $includeCategories) (has $plane.name $includeNames) -}}
+    {{- $volumeName := (printf "%s-pv" $plane.name) -}}
    {{- include "hpcc.changeMountPerms" (dict "root" .root "uid" $uid "gid" $gid "volumeName" $volumeName "volumePath" $plane.prefix) | nindent 0 }}
+   {{- end -}}
   {{- end -}}
  {{- end -}}
 {{- end -}}
@@ -1146,12 +1170,14 @@ Pass in dict with root, category.  optional name to restrict it to a single name
 {{- $planes := ($storage.planes | default list) -}}
 {{- $previousMounts := dict -}}
 {{- range $plane := $planes -}}
- {{- if (hasKey $plane "storageClass") -}}
-  {{- if not (hasKey $previousMounts $plane.prefix) -}}
-   {{- $pvcname := (printf "%s-pvc" $plane.name) -}}
-   {{- include "hpcc.addPVC" (dict "root" $ "name" $pvcname "me" $plane) }}
-   {{- $_ := set $previousMounts $plane.prefix true -}}
- {{- end }}
+ {{- if not $plane.disabled -}}
+  {{- if (hasKey $plane "storageClass") -}}
+   {{- if not (hasKey $previousMounts $plane.prefix) -}}
+    {{- $pvcname := (printf "%s-pvc" $plane.name) -}}
+    {{- include "hpcc.addPVC" (dict "root" $ "name" $pvcname "me" $plane) }}
+    {{- $_ := set $previousMounts $plane.prefix true -}}
+   {{- end }}
+  {{- end }}
  {{- end }}
 {{- end }}
 {{- end -}}

+ 10 - 1
helm/hpcc/templates/roxie.yaml

@@ -82,7 +82,7 @@ data:
 {{- $env := concat ($.Values.global.env | default list) (.env | default list) -}}
 {{- $secretsCategories := list "system" "ecl-user" "ecl" "storage" }}
 {{- $toposerver := ($roxie.topoServer | default dict) -}}
-{{- $commonCtx := dict "root" $ "me" $roxie "includeCategories" (list "lz" "data" "remote" "spill" "dll") "secretsCategories" $secretsCategories "toposerver" $toposerver "env" $env }}
+{{- $commonCtx := dict "root" $ "me" $roxie "includeCategories" (list "lz" "data" "remote" "spill" "dll" "debug") "secretsCategories" $secretsCategories "toposerver" $toposerver "env" $env }}
 {{- $_ := set $commonCtx "toponame" (printf "%s-toposerver" $roxie.name) -}}
 {{- $_ := set $commonCtx "numChannels" ($roxie.numChannels | int | default 1) -}}
 {{- $_ := set $commonCtx "topoport" ($toposerver.port | int | default 9004) -}}
@@ -381,15 +381,24 @@ spec:
                 "--log-fd=1",
                 "roxie",
 {{- else }}
+{{- if (include "hpcc.hasPlaneForCategory" (dict "root" $ "category" "debug")) }}
+        command: [ check_executes ] 
+        args: [
+                "-f", "/etc/config/{{ $roxie.name }}.yaml",
+                "--",
+                "roxie",
+{{- else }}
         command: [ roxie ] 
         args: [
 {{- end }}
+{{- end }}
                 {{ include "hpcc.configArg" $roxie }},
                 {{ include "hpcc.daliArg" $ }},
                 "--channels={{ $channel }}", 
                 "--server={{ not $roxie.serverReplicas }}",
               ]
         env:
+{{ include "hpcc.printDebugEnvironment" $commonCtx | indent 8 }}
 {{ include "hpcc.mergeEnvironments" $env | indent 8 -}}
         - name: "SENTINEL"
           value: "/tmp/{{ $roxie.name }}.sentinel"

+ 16 - 14
helm/hpcc/templates/storage.yaml

@@ -27,22 +27,24 @@
 {{- $previousPlanes := dict -}}
 {{- $previousMounts := dict -}}
 {{- range $plane := $planes -}}
- {{- if (ne (lower $plane.name) $plane.name) -}}
-  {{- required (printf "Name of storage plane '%s' must not contain upper case" $plane.name) nil -}}
- {{- end -}}
- {{- if hasKey $previousPlanes $plane.name -}}
-  {{- required (printf "Storage plane '%s' is defined more than once" $plane.name) nil -}}
- {{- end -}}
- {{- $_ := set $previousPlanes $plane.name true -}}
- {{- $rawPlane := omit $plane "name" "category" "subPath" -}}
- {{- if and (not (hasKey $plane "hosts")) (not (hasKey $plane "hostGroup")) -}}
-  {{- if hasKey $previousMounts $plane.prefix -}}
-   {{- /* Should this restrict if has a pvc or a storage class? */ -}}
-   {{- if not (deepEqual (get $previousMounts $plane.prefix) $rawPlane) -}}
-    {{- required (printf "Multiple incompatible planes refer to prefix '%s'" $plane.prefix) nil -}}
+ {{- if not .disabled -}}
+  {{- if (ne (lower $plane.name) $plane.name) -}}
+   {{- required (printf "Name of storage plane '%s' must not contain upper case" $plane.name) nil -}}
+  {{- end -}}
+  {{- if hasKey $previousPlanes $plane.name -}}
+   {{- required (printf "Storage plane '%s' is defined more than once" $plane.name) nil -}}
+  {{- end -}}
+  {{- $_ := set $previousPlanes $plane.name true -}}
+  {{- $rawPlane := omit $plane "name" "category" "subPath" -}}
+  {{- if and (not (hasKey $plane "hosts")) (not (hasKey $plane "hostGroup")) -}}
+   {{- if hasKey $previousMounts $plane.prefix -}}
+    {{- /* Should this restrict if has a pvc or a storage class? */ -}}
+    {{- if not (deepEqual (get $previousMounts $plane.prefix) $rawPlane) -}}
+     {{- required (printf "Multiple incompatible planes refer to prefix '%s'" $plane.prefix) nil -}}
+    {{- end -}}
    {{- end -}}
+   {{- $_ := set $previousMounts $plane.prefix $rawPlane -}}
   {{- end -}}
-  {{- $_ := set $previousMounts $plane.prefix $rawPlane -}}
  {{- end -}}
 {{- end -}}
 {{- /* check that planes exist for each of the required planes */ -}}

+ 5 - 1
helm/hpcc/values.schema.json

@@ -449,6 +449,10 @@
           "description": "the name of the storage plane",
           "type": "string"
         },
+        "disabled": {
+          "description": "disable this plane definition",
+          "type": "boolean"
+        },
         "prefix": {
           "description": "either the path for a local mount, or the url prefix",
           "type": "string"
@@ -490,7 +494,7 @@
         "category": {
           "description": "the category this plane is used for, e.g. lz, data",
           "type": "string",
-          "enum": ["data", "lz", "dali", "sasha", "dll", "spill", "temp", "git", "remote" ]
+          "enum": ["data", "lz", "dali", "sasha", "dll", "spill", "temp", "git", "remote", "debug" ]
         },
         "umask" : {
           "description": "file creation mask (used by despray)",

+ 6 - 0
helm/hpcc/values.yaml

@@ -191,6 +191,12 @@ storage:
     storageSize: 1Gi
     prefix: "/var/lib/HPCCSystems/mydropzone"
     category: lz
+  - name: debug
+    disabled: False
+    storageClass: ""
+    storageSize: 1Gi
+    prefix: "/var/lib/HPCCSystems/debug"
+    category: debug
 
 ## The certificates section can be used to enable cert-manager to generate TLS certificates for each component in the hpcc.
 ## You must first install cert-manager to use this feature.

+ 6 - 1
initfiles/CMakeLists.txt

@@ -39,7 +39,12 @@ if ( PLATFORM AND UNIX )
     configure_file("${CMAKE_CURRENT_SOURCE_DIR}/bash-vars.in" "${CMAKE_BINARY_DIR}/bash-vars")
     set(bash-vars "${CMAKE_BINARY_DIR}/bash-vars")
 
-    if ( NOT CONTAINERIZED )
+    if ( CONTAINERIZED )
+      install ( FILES bin/.gdbinit DESTINATION ${EXEC_DIR} COMPONENT Runtime )
+      install ( FILES bin/post-mortem-gdb DESTINATION ${EXEC_DIR} COMPONENT Runtime )
+      install ( FILES lib/libjlib.so-gdb.py DESTINATION ${LIB_DIR} COMPONENT Runtime )
+      install ( PROGRAMS bin/check_executes DESTINATION ${EXEC_DIR} COMPONENT Runtime )
+    else ()
       ADD_SUBDIRECTORY(etc)
       ADD_SUBDIRECTORY(bash)
       ADD_SUBDIRECTORY(bin)

+ 3 - 0
initfiles/bin/.gdbinit

@@ -0,0 +1,3 @@
+# These commands will be executed by gdb on startup
+add-auto-load-safe-path /opt/HPCCSystems/lib/libjlib.so-gdb.py
+set print object 1

+ 92 - 0
initfiles/bin/check_executes

@@ -0,0 +1,92 @@
+#!/bin/bash
+
+usage() {
+  echo "Usage: check-executes [options] -- cmd args"
+  echo "    -d <directory>     Mounted directory to store post-mortem info in"
+  echo "    -f <file>          Specifies a file to preserve on post-mortem"
+}
+
+PMD_DIRECTORYBASE=${HPCC_DEBUG_PATH}
+[[ -z ${PMD_DIRECTORYBASE} ]] && PMD_DIRECTORYBASE=$(pwd)
+PMD_PROGNAME=
+PMD_COPYFILES=()
+PMD_DALISERVER=
+PMD_WORKUNIT=
+
+while [ "$#" -gt 0 ]; do
+  arg=$1
+  if [[ ${arg:0:1} == '-' ]]; then
+    case "${arg:1:1}" in
+      -) shift
+         PROGNAME=$1
+         shift
+         break
+         ;;
+      d) shift;
+         PMD_DIRECTORYBASE=$1
+         ;;
+      f) shift;
+         PMD_COPYFILES+=($1)
+         ;;
+      *) usage
+         exit
+         ;;
+    esac
+  else
+    usage
+    exit
+  fi
+  shift
+done
+
+if [[ -z $PMD_PROGNAME} ]] ; then
+  usage
+  exit
+fi
+
+# Scan managed process parameters for additional information
+for (( arg=1; arg <= "$#"; arg++ )); do
+  optname=${!arg%=*}
+  optval=${!arg#*=}
+  if [[ ${optname} == '--config' ]]; then
+    PMD_COPYFILES+=(${optval})
+  elif [[] ${optname} == '--daliServer' ]]; then
+    PMD_DALISERVER=${optval}
+  elif [[] ${optname} == '--workunit' ]]; then
+    PMD_WORKUNIT=${optval}
+  fi
+done
+
+ulimit -c unlimited
+
+# Execute the main program, defaulting postmortem logging on (can be overriden by program's config file)
+${PROGNAME} --logging.postMortem=1000 "$@" 
+
+# If it did not exit cleanly, copy some post-mortem info
+retVal=$?
+if [ $retVal -ne 0 ]; then
+  POST_MORTEM_DIR=${PMD_DIRECTORYBASE}/$(hostname)/$(date -Iseconds)
+  mkdir -p ${POST_MORTEM_DIR}
+  echo "Post-mortem info gathered in $POST_MORTEM_DIR"
+  echo "Process exited with code $retVal" | tee $POST_MORTEM_DIR/info.log
+  for f in ${PMD_COPYFILES[@]}; do
+    mkdir -p $POST_MORTEM_DIR/$(dirname $f)  
+    cp $f $POST_MORTEM_DIR/$f
+    echo "Copied $f to $POST_MORTEM_DIR/$f" | tee -a $POST_MORTEM_DIR/info.log
+  done
+  cp `ls -rt /tmp/postmortem.log.*` $POST_MORTEM_DIR
+  rm /tmp/postmortem.log.*
+  if [ -f core ]; then
+    echo "Generating info from core file to $POST_MORTEM_DIR/info.log" | tee -a $POST_MORTEM_DIR/info.log
+    gdb -batch -x /opt/HPCCSystems/bin/post-mortem-gdb ${PROGNAME} core 2>$POST_MORTEM_DIR/info.err >>$POST_MORTEM_DIR/info.log
+    echo "Generated info from core file" | tee -a $POST_MORTEM_DIR/info.log
+    rm core
+  fi
+  if [[ -n "${PMD_DALISERVER}" ]] && [[ -n "${PMD_WORKUNIT}" ]]; then
+    wutool postmortem ${PMD_WORKUNIT} DALISERVERS=${PMD_DALISERVER} PMD=${POST_MORTEM_DIR}
+    echo Updated workunit ${PMD_WORKUNIT}
+  fi
+fi
+exit $retVal
+
+

+ 16 - 0
initfiles/bin/post-mortem-gdb

@@ -0,0 +1,16 @@
+# These commands will be executed by gdb to extract post-mortem debug information from a corefile
+# These commands define what information is gathered
+# stack-info and all-globals are custom commands defined in libjlib.so-gdb.py
+
+set print object 1
+info registers
+info threads
+info shared
+# gdb's echo prints no newline unless one is given explicitly as \n
+echo \n
+echo =========== Full thread info =====================\n
+echo \n
+thread apply all stack-info 20 5
+echo \n
+echo =========== Global variables =====================\n
+echo \n
+all-globals
+

+ 209 - 0
initfiles/lib/libjlib.so-gdb.py

@@ -0,0 +1,209 @@
+import gdb
+import re
+import gdb.printing
+
+class StringBufferPrinter:
+    """Print a StringBuffer object."""
+    def __init__(self, val):
+        self.val = val
+    def to_string(self):
+        len = int(self.val['curLen'])
+        if len:
+          return self.val['buffer'].string()[0:len]
+        else:
+          return ""
+    def display_hint(self):
+        return 'string'
+
+class StringAttrPrinter:
+    """Print a StringAttr object."""
+    def __init__(self, val):
+        self.val = val
+    def to_string(self):
+        return self.val['text']
+    def display_hint(self):
+        return 'string'
+
+class AtomicBoolPrinter:
+    """Print an atomic bool object."""
+    def __init__(self, val):
+        self.val = val
+    def to_string(self):
+        return self.val['_M_base']['_M_i']
+
+class AtomicScalarPrinter:
+    """Print an atomic scalar object."""
+    def __init__(self, val):
+        self.val = val
+    def to_string(self):
+        return self.val['_M_i']
+
+class AtomicVectorPrinter:
+    """Print an atomic pointer object."""
+    def __init__(self, val):
+        self.val = val
+    def to_string(self):
+        return self.val['_M_b']['_M_p']
+
+class AtomPrinter:
+    def __init__(self, val):
+        self.val = val
+    def to_string(self):
+        return self.val['key']
+    def display_hint(self):
+        return 'string'
+
+class CriticalSectionPrinter:
+    def __init__(self, val):
+        self.val = val
+    def to_string(self):
+        return "CriticalSection with owner=%s, depth=%d" % (self.val['owner'], self.val['depth'])
+
+class IInterfacePrinter:
+    def __init__(self, val):
+        self.val = val
+    def to_string(self):
+      return str(self.val.dynamic_type)
+
+class CInterfacePrinter:
+    def __init__(self, val):
+        self.val = val
+    def to_string(self):
+      return str(self.val.dynamic_type) + " xxcount=" + str(self.val['xxcount'])
+
+class OwnedPrinter:
+    def __init__(self, val):
+        self.val = val
+    def to_string(self):
+      return str(self.val['ptr'])
+
+class MapStringToMyClassPrinter:
+    def __init__(self, val):
+        self.val = val
+    def to_string(self):
+      cache = int(self.val['cache'])
+      table = str(self.val['table'])
+      tablesize = int(self.val['tablesize'])
+      tablecount = int(self.val['tablecount'])
+      keysize = int(self.val['keysize'])
+      ignorecase = bool(self.val['ignorecase'])
+      return "cache=%u table=%s tablesize=%r tablecount=%r keysize=%r ignorecase=%r" % (cache, table, tablesize, tablecount, keysize, ignorecase)
+
+def build_pretty_printer():
+    pp = gdb.printing.RegexpCollectionPrettyPrinter(
+        "HPCC-Platform/jlib")
+    pp.add_printer('StringBuffer', '^[V]?StringBuffer$', StringBufferPrinter)
+    pp.add_printer('StringAttr', '^StringAttr$', StringAttrPrinter)
+    pp.add_printer('std::atomic<bool>', '^std::atomic<bool>$', AtomicBoolPrinter)
+    pp.add_printer('std::atomic<int>', '^(RelaxedA|std::a)tomic<(unsigned int|int)>$', AtomicScalarPrinter)
+    pp.add_printer('std::atomic<ptr>', '^std::atomic<.*\*>$', AtomicVectorPrinter)
+    pp.add_printer('Atom', '^(LowerCase)?Atom$', AtomPrinter)
+    pp.add_printer('CriticalSection', '^CriticalSection$', CriticalSectionPrinter)
+    pp.add_printer('IInterfacePrinter', '^IInterface$', IInterfacePrinter)
+    pp.add_printer('CInterfacePrinter', '^C(Simple)?InterfaceOf<.*>$', CInterfacePrinter)
+    pp.add_printer('OwnedPrinter', '^(Shared|Owned)<.*>$', OwnedPrinter)
+    pp.add_printer('MapStringToMyClassPrinter', '^MapStringToMyClass<.*>$', MapStringToMyClassPrinter)
+    return pp
+
+gdb.printing.register_pretty_printer(
+    #gdb.current_objfile(),
+    gdb.objfiles()[0],
+    build_pretty_printer())
+
+class StackInfo (gdb.Command):
+    """ stack-info n m Shows backtrace for n frames with full local variable info for m interesting ones """
+
+    def __init__ (self):
+        super(StackInfo, self).__init__ ("stack-info", gdb.COMMAND_DATA)
+
+    def framecount():
+        n = 0
+        f = gdb.newest_frame()
+        while f:
+            n = n + 1
+            f = f.older()
+        return n
+
+    def isInterestingFrame(self):
+        f = gdb.selected_frame()
+        if not f:
+          return False
+        sal = f.find_sal()
+        if sal and sal.symtab and re.match("/hpcc-dev/", sal.symtab.filename):
+          return True
+        return False
+
+    def invoke (self, arg, from_tty):
+        self.allInteresting = set()
+        argv = gdb.string_to_argv(arg)
+        count = int(argv[0])
+        full = int(argv[1])
+        frames = StackInfo.framecount()-1
+        back = 0
+        while count and frames:
+          gdb.execute("up 0")   # prints current frame info
+          if full and (self.isInterestingFrame()):
+            gdb.execute("info locals")
+            full -= 1
+          gdb.execute("up-silently")
+          frames -= 1
+          count -= 1
+          back += 1
+        gdb.execute("down-silently " + str(back))
+
+StackInfo()
+
+class AllGlobals (gdb.Command):
+    """ all-globals shows all global variables defined in any module that is part of the HPCC platform """
+
+    def __init__ (self):
+        super(AllGlobals, self).__init__ ("all-globals", gdb.COMMAND_DATA)
+
+    def invoke (self, arg, from_tty):
+        ignoreFiles = set(['system/jlib/jlzw.cpp', 'system/jlib/jlog.cpp', 'system/jlib/jencrypt.cpp', 'system/jlib/jcrc.cpp',
+                           'system/security/zcrypt/aes.cpp', 'common/workunit/wuattr.cpp', 'ecl/hql/reservedwords.cpp'])
+        ignoreVars = set(['statsMetaData', 'roAttributes', 'roAttributeValues', 'RandomMain'])
+        ignorematch = re.compile(" StatisticsMapping ")
+        varmatch = re.compile("[^a-zA-Z_0-9:]([a-zA-Z_][a-z0-9_A-Z:]*)(\\[.*])?;$")
+        goodfilematch = re.compile("^File /hpcc-dev/HPCC-Platform/(.*[.]cpp):$")
+        filematch = re.compile("^File (.*):$")
+        infile = None
+        file_written = False
+        allvars = gdb.execute("info variables", False, True)
+        for line in allvars.splitlines():
+          m = goodfilematch.search(line)
+          if m:
+            infile = m.group(1)
+            file_written = False
+            if infile in ignoreFiles:
+              infile = None
+          elif filematch.search(line):
+            infile = None
+          elif infile:
+            if (ignorematch.search(line)):
+              continue
+            m = varmatch.search(line)
+            if m:
+              varname = m.group(1)
+              if varname in ignoreVars:
+                continue
+              sym = gdb.lookup_global_symbol(varname)
+              if not sym:
+                sym = gdb.lookup_static_symbol(varname)
+              if sym and not sym.is_constant: 
+                if not file_written:
+                  gdb.write('\n' + infile + ':\n')
+                  file_written = True
+                gdb.write('  {} = {}\n'.format(sym.name, sym.value(gdb.newest_frame())))
+              # There are some variables that gdb generates names incorrectly - e.g. if type is const char *const...
+              # We don't care about them... But uncomment the next two lines if you want to see them or if other things seem to be missing
+              # if not sym:
+              #  gdb.write(line + ' ' + varname+' not resolved\n')
+            elif line:
+              pass
+              # These are variables we didn't managed to parse the name of...
+              # gdb.write(line+'not parsed \n')
+            
+AllGlobals()
+
+#see https://sourceware.org/gdb/onlinedocs/gdb/Writing-a-Pretty_002dPrinter.html#Writing-a-Pretty_002dPrinter for more information on this

+ 0 - 2
system/jlib/jdebug.cpp

@@ -4077,8 +4077,6 @@ public:
     virtual void setMessageFields(unsigned _fields __attribute__((unused)) = MSGFIELD_all) {}
     virtual void addToPTree(IPropertyTree * parent __attribute__((unused))) const {}
     virtual int flush() { return 0; }
-    virtual char const *disable() { return 0; }
-    virtual void enable() {}
     virtual bool getLogName(StringBuffer &name __attribute__((unused))) const { return false; }
     virtual offset_t getLogPosition(StringBuffer &logFileName __attribute__((unused))) const { return 0; };
 

+ 77 - 41
system/jlib/jlog.cpp

@@ -1107,6 +1107,68 @@ void FileLogMsgHandlerXML::addToPTree(IPropertyTree * tree) const
     tree->addPropTree("handler", handlerTree);
 }
 
+// PostMortemLogMsgHandler
+
+// Stores the file base name, line limit and message field mask, then opens the first log file.
+PostMortemLogMsgHandler::PostMortemLogMsgHandler(const char * _filebase, unsigned _maxLinesToKeep, unsigned _messageFields)
+  : filebase(_filebase),
+    maxLinesToKeep(_maxLinesToKeep),
+    messageFields(_messageFields)
+{
+    openFile();
+}
+
+// Releases the current log file via closeAndDeleteEmpty.
+// NOTE(review): presumably that helper closes the handle and removes the file if nothing was written - confirm.
+PostMortemLogMsgHandler::~PostMortemLogMsgHandler()
+{
+    closeAndDeleteEmpty(filename, handle);
+}
+
+// Writes one message to the current file under the handler's critical section,
+// rolling over to a new file first when the line limit has been reached.
+void PostMortemLogMsgHandler::handleMessage(const LogMsg & msg)
+{
+    CriticalBlock block(crit);
+    if (!handle)
+        return;
+
+    checkRollover();
+    msg.fprintTable(handle, messageFields);
+    if (flushes)
+        fflush(handle);
+    ++linesInCurrent;
+}
+
+// Currently adds nothing to the supplied tree for this handler.
+void PostMortemLogMsgHandler::addToPTree(IPropertyTree * tree) const
+{
+}
+
+// Starts a new file once the current one holds maxLinesToKeep lines.
+void PostMortemLogMsgHandler::checkRollover()
+{
+    if (linesInCurrent < maxLinesToKeep)
+        return;
+    doRollover();
+}
+
+// Closes the current file, removes the file that preceded it (so only the file
+// just closed and the newly opened one remain), then opens the next file in sequence.
+void PostMortemLogMsgHandler::doRollover()
+{
+    closeAndDeleteEmpty(filename, handle);
+    handle = nullptr;
+    if (sequence > 0)
+    {
+        StringBuffer agedName;
+        agedName.append(filebase);
+        agedName.append('.');
+        agedName.append(sequence-1);
+        remove(agedName);
+    }
+    ++sequence;
+    openFile();
+}
+
+// Opens <filebase>.<sequence> for writing, creating its directory first, and resets the line count.
+void PostMortemLogMsgHandler::openFile()
+{
+    filename.clear();
+    filename.append(filebase).append('.').append(sequence);
+    const char * path = filename.str();
+    recursiveCreateDirectoryForFile(path);
+    handle = fopen(path, "wt");
+    // If we can't write where we expected, write to /dev/null instead
+    if (!handle)
+        handle = getNullHandle();
+    linesInCurrent = 0;
+}
+
 // RollingFileLogMsgHandler
 #define MIN_LOGFILE_SIZE_LIMIT 10000
 #define LOG_LINE_SIZE_ESTIMATE 80
@@ -1133,24 +1195,6 @@ RollingFileLogMsgHandler::~RollingFileLogMsgHandler()
     closeAndDeleteEmpty(filename,handle);
 }
 
-char const * RollingFileLogMsgHandler::disable()
-{
-    crit.enter();
-    fclose(handle);
-    return filename;
-}
-
-void RollingFileLogMsgHandler::enable()
-{
-    recursiveCreateDirectoryForFile(filename);
-    handle = fopen(filename, "a");
-    if(!handle) {
-        handle = getNullHandle();
-        assertex(!"RollingFileLogMsgHandler::enable : could not open file for output");
-    }
-    crit.leave();
-}
-
 void RollingFileLogMsgHandler::addToPTree(IPropertyTree * tree) const
 {
     IPropertyTree * handlerTree = createPTree(ipt_caseInsensitive);
@@ -1302,24 +1346,6 @@ void BinLogMsgHandler::addToPTree(IPropertyTree * tree) const
     tree->addPropTree("handler", handlerTree);
 }
 
-char const * BinLogMsgHandler::disable()
-{
-    crit.enter();
-    fstr.clear();
-    fio.clear();
-    return filename.get();
-}
-
-void BinLogMsgHandler::enable()
-{
-    fio.setown(file->open(IFOwrite));
-    if(!fio) assertex(!"BinLogMsgHandler::enable : Could not create IFileIO");
-    fstr.setown(createIOStream(fio));
-    if(!fstr) assertex(!"BinLogMsgHandler::enable : Could not create IFileIOStream");
-    fstr->seek(0, IFSend);
-    crit.leave();
-}
-
 // LogMsgComponentReporter
 
 void LogMsgComponentReporter::report(const LogMsgCategory & cat, const char * format, ...)
@@ -2414,6 +2440,11 @@ ILogMsgHandler * getBinLogMsgHandler(const char * filename, bool append)
     return new BinLogMsgHandler(filename, append);
 }
 
+ILogMsgHandler * getPostMortemLogMsgHandler(const char * filebase, unsigned maxLinesToKeep, unsigned messageFields)   // Factory: creates a PostMortemLogMsgHandler writing to "<filebase>.<n>" files
+{
+    return new PostMortemLogMsgHandler(filebase, maxLinesToKeep, messageFields);   // Returns a newly allocated handler; setupContainerizedLogMsgHandler() hands it to addMonitorOwn()
+}
+
 void installLogMsgFilterSwitch(ILogMsgHandler * handler, ILogMsgFilter * switchFilter, ILogMsgFilter * newFilter)
 {
     queryLogMsgManager()->changeMonitorFilterOwn(handler, getSwitchLogMsgFilterOwn(switchFilter, newFilter, queryLogMsgManager()->getMonitorFilter(handler)));
@@ -2616,8 +2647,6 @@ MODULE_EXIT()
     thePassAllFilter = nullptr;
 }
 
-#ifdef _CONTAINERIZED
-
 static constexpr const char * logFieldsAtt = "@fields";
 static constexpr const char * logMsgDetailAtt = "@detail";
 static constexpr const char * logMsgAudiencesAtt = "@audiences";
@@ -2625,8 +2654,9 @@ static constexpr const char * logMsgClassesAtt = "@classes";
 static constexpr const char * useLogQueueAtt = "@useLogQueue";
 static constexpr const char * logQueueLenAtt = "@queueLen";
 static constexpr const char * logQueueDropAtt = "@queueDrop";
-static constexpr const char * logQueueDisabledAtt = "@disabled";
+static constexpr const char * logDisabledAtt = "@disabled";
 static constexpr const char * useSysLogpAtt ="@enableSysLog";
+static constexpr const char * capturePostMortemAtt ="@postMortem";
 
 #ifdef _DEBUG
 static constexpr bool useQueueDefault = false;
@@ -2643,7 +2673,7 @@ void setupContainerizedLogMsgHandler()
     Owned<IPropertyTree> logConfig = getComponentConfigSP()->getPropTree("logging");
     if (logConfig)
     {
-        if (logConfig->getPropBool(logQueueDisabledAtt, false))
+        if (logConfig->getPropBool(logDisabledAtt, false))
         {
             removeLog();
             return;
@@ -2688,9 +2718,15 @@ void setupContainerizedLogMsgHandler()
 
         if (logConfig->getPropBool(useSysLogpAtt, useSysLogDefault))
             UseSysLogForOperatorMessages();
+
+        unsigned postMortemLines = logConfig->getPropInt(capturePostMortemAtt, 0);
+        if (postMortemLines)
+        {
+            ILogMsgHandler *fileMsgHandler = getPostMortemLogMsgHandler("/tmp/postmortem.log", postMortemLines, MSGFIELD_STANDARD);
+            queryLogMsgManager()->addMonitorOwn(fileMsgHandler, getCategoryLogMsgFilter(MSGAUD_all, MSGCLS_all, TopDetail));
+        }
     }
 }
-#endif
 
 ILogMsgManager * queryLogMsgManager()
 {

+ 1 - 2
system/jlib/jlog.hpp

@@ -644,8 +644,6 @@ interface jlib_decl ILogMsgHandler : public IInterface
     virtual void              setMessageFields(unsigned _fields = MSGFIELD_all) = 0;
     virtual void              addToPTree(IPropertyTree * parent) const = 0;
     virtual int               flush() { return 0; }
-    virtual char const *      disable() { return 0; }
-    virtual void              enable() {}
     virtual bool              getLogName(StringBuffer &name) const = 0;
     virtual offset_t          getLogPosition(StringBuffer &logFileName) const = 0;
 };
@@ -820,6 +818,7 @@ extern jlib_decl ILogMsgHandler * getHandleLogMsgHandler(FILE * handle = stderr,
 extern jlib_decl ILogMsgHandler * getFileLogMsgHandler(const char * filename, const char * headertext = 0, unsigned fields = MSGFIELD_all, bool writeXML = true, bool append = false, bool flushes = true);
 extern jlib_decl ILogMsgHandler * getRollingFileLogMsgHandler(const char * filebase, const char * fileextn, unsigned fields = MSGFIELD_all, bool append = false, bool flushes = true, const char *initialName = NULL, const char *alias = NULL, bool daily = false, long maxLogSize = 0);
 extern jlib_decl ILogMsgHandler * getBinLogMsgHandler(const char * filename, bool append = false);
+extern jlib_decl ILogMsgHandler * getPostMortemLogMsgHandler(const char * filebase, unsigned maxLinesToKeep, unsigned _messageFields=MSGFIELD_all);
 
 // Function to install switch filter into a monitor, switch some messages to new filter whilst leaving rest to previous filter
 

+ 30 - 6
system/jlib/jlog.ipp

@@ -541,6 +541,36 @@ private:
     bool                      prepped;
 };
 
+class PostMortemLogMsgHandler : public CInterfaceOf<ILogMsgHandler>   // Log handler that writes messages to numbered files "<filebase>.<sequence>", starting a new file every maxLinesToKeep messages
+{
+public:
+    PostMortemLogMsgHandler(const char * _filebase, unsigned _maxLinesToKeep, unsigned _messageFields=MSGFIELD_all);
+    virtual ~PostMortemLogMsgHandler();
+    virtual void handleMessage(const LogMsg & msg) override;   // Writes msg to the current file under crit, rolling over first when the limit is reached
+    virtual bool needsPrep() const override { return false; }   // This handler never asks for prep()
+    virtual void prep() override {}   // No-op
+    virtual unsigned queryMessageFields() const override { return messageFields; }
+    virtual void setMessageFields(unsigned _fields) override { messageFields = _fields; }   // NOTE(review): written without crit, while handleMessage() reads messageFields under crit
+    virtual void addToPTree(IPropertyTree * tree) const override;   // Defined with an empty body
+    virtual int flush() override { CriticalBlock block(crit); return fflush(handle); }   // Returns the fflush() result for the current file
+    virtual bool getLogName(StringBuffer &name) const override { CriticalBlock block(crit); name.append(filename); return true; }   // Appends the current file name to name
+    virtual offset_t getLogPosition(StringBuffer &name) const override { CriticalBlock block(crit); fflush(handle); name.append(filename); return ftell(handle); }   // Flushes, appends the current file name, returns the ftell() offset in that file
+protected:
+    void checkRollover();   // Calls doRollover() when linesInCurrent >= maxLinesToKeep
+    void doRollover();   // Closes the current file, removes the one before it, opens the next
+    void openFile();   // Opens "<filebase>.<sequence>" and resets linesInCurrent
+protected:
+    mutable FILE *handle = nullptr;   // Current output stream; set by openFile(), which falls back to getNullHandle() if fopen fails
+    StringAttr filebase;   // Prefix shared by all files in the sequence
+    mutable StringBuffer filename;   // Name of the file currently being written (filebase + '.' + sequence)
+    mutable CriticalSection crit;   // Taken by handleMessage(), flush(), getLogName() and getLogPosition()
+    const unsigned maxLinesToKeep = 0;   // Rollover threshold compared against linesInCurrent
+    unsigned messageFields = MSGFIELD_all;   // Field mask passed to LogMsg::fprintTable
+    unsigned linesInCurrent = 0;   // Messages written to the current file since it was opened
+    unsigned sequence = 0;   // Numeric suffix of the current file; incremented on each rollover
+    const bool flushes = true;   // When true, handleMessage() calls fflush after every message
+};
+
 class RollingFileLogMsgHandler : implements ILogMsgHandler, public CInterface
 {
 public:
@@ -570,8 +600,6 @@ public:
     void                      setMessageFields(unsigned _fields) { messageFields = _fields; }
     void                      addToPTree(IPropertyTree * tree) const;
     int                       flush() { CriticalBlock block(crit); return fflush(handle); }
-    char const *              disable();
-    void                      enable();
     bool                      getLogName(StringBuffer &name) const { CriticalBlock block(crit); name.append(filename); return true; }
     offset_t                  getLogPosition(StringBuffer &name) const { CriticalBlock block(crit); fflush(handle); name.append(filename); return ftell(handle); }
 protected:
@@ -610,8 +638,6 @@ public:
     unsigned                  queryMessageFields() const { return MSGFIELD_all; }
     void                      setMessageFields(unsigned _fields) {}
     int                       flush() { return 0; }
-    char const *              disable();
-    void                      enable();
     bool                      getLogName(StringBuffer &name) const { name.append(filename); return true; }
     offset_t                  getLogPosition(StringBuffer &name) const { CriticalBlock block(crit); name.append(filename); return fstr->tell(); }
 protected:
@@ -638,8 +664,6 @@ public:
     unsigned                  queryMessageFields() const { return fields; }
     void                      setMessageFields(unsigned _fields) { fields = _fields; }
     int                       flush() { return 0; }
-    char const *              disable() { return "Audit"; }
-    void                      enable() {}
     bool                      getLogName(StringBuffer &name) const { return false; }
     offset_t                  getLogPosition(StringBuffer &logFileName) const { return 0; }
 protected:

+ 12 - 1
tools/wutool/wutool.cpp

@@ -93,6 +93,7 @@ void usage(const char * action = nullptr)
                "   archive <workunits> - Archive to xml files [TO=<directory>] [DEL=1] [DELETERESULTS=1] [INCLUDEFILES=1]\n"
                "   restore <filenames> - Restore from xml files [INCLUDEFILES=1]\n"
                "   importzap <zapreport-filename> <output-helper-directory> [<zapreport-password>]\n"
+               "   postmortem <workunit> PMD=<dir> - Add post-mortem info\n"
                 "\n"
                "   orphans             - Delete orphaned information from store\n"
                "   cleanup [days=NN]   - Delete workunits older than NN days\n"
@@ -239,6 +240,16 @@ static void process(IConstWorkUnit &w, IProperties *globals, const StringArray &
         }
         printf("deleted %s\n", wuid.str());
     }
+    else if (stricmp(action, "postmortem")==0)
+    {
+        Owned<IWorkUnitFactory> factory = getWorkUnitFactory();
+        StringBuffer postMortemDirectory;
+        globals->getProp("PMD", postMortemDirectory);
+        StringAttr wuid(w.queryWuid());
+        Owned<IWorkUnit> lw = factory->updateWorkUnit(wuid);
+        Owned<IWUQuery> query = lw->updateQuery();
+        associateLocalFile(query, FileTypePostMortem, postMortemDirectory, "PostMortem", 0);
+    }
     else if (stricmp(action, "archive")==0)
     {
         Owned<IWorkUnitFactory> factory = getWorkUnitFactory();
@@ -622,7 +633,7 @@ int main(int argc, const char *argv[])
         {
             usage();
         }
-        else if (strieq(action, "list") || strieq(action, "dump") || strieq(action, "results") || strieq(action, "delete") || strieq(action, "archive") || strieq(action, "info") || strieq(action, "analyze"))
+        else if (strieq(action, "list") || strieq(action, "dump") || strieq(action, "results") || strieq(action, "delete") || strieq(action, "archive") || strieq(action, "info") || strieq(action, "analyze") || strieq(action, "postmortem"))
         {
             if (strieq(action, "info") && args.empty())
                 args.append("source[all],properties[all]");