From d62f37a04421fe7a12035772625e7bed04884988 Mon Sep 17 00:00:00 2001
From: Ralph Castain <rhc@pmix.org>
Date: Wed, 28 Apr 2021 15:54:38 -0700
Subject: [PATCH] Be more robust in getting hwloc topology

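If importing the topology from the provided XML fails at any step
(setting the XML buffer, setting the topology flags, or loading the
topology), fall back to discovery instead of returning an error that
causes the job to abort.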

Signed-off-by: Ralph Castain <rhc@pmix.org>
---
 opal/mca/hwloc/base/hwloc_base_util.c | 11 ++++++++---
 1 file changed, 8 insertions(+), 3 deletions(-)

diff --git a/opal/mca/hwloc/base/hwloc_base_util.c b/opal/mca/hwloc/base/hwloc_base_util.c
index 5afa0b9d80..05bdbbda14 100644
--- a/opal/mca/hwloc/base/hwloc_base_util.c
+++ b/opal/mca/hwloc/base/hwloc_base_util.c
@@ -21,6 +21,7 @@
  * Copyright (c) 2018      Amazon.com, Inc. or its affiliates.  All Rights reserved.
  * Copyright (c) 2019-2021 IBM Corporation. All rights reserved.
  * Copyright (c) 2019-2020 Inria.  All rights reserved.
+ * Copyright (c) 2021      Nanook Consulting.  All rights reserved.
  * $COPYRIGHT$
  *
  * Additional copyrights may follow
@@ -369,9 +370,10 @@ int opal_hwloc_base_get_topology(void)
             return OPAL_ERROR;
         }
         if (0 != hwloc_topology_set_xmlbuffer(opal_hwloc_topology, val, strlen(val) + 1)) {
+            /* default to discovery */
             free(val);
             hwloc_topology_destroy(opal_hwloc_topology);
-            return OPAL_ERROR;
+            goto discover;
         }
         /* since we are loading this from an external source, we have to
          * explicitly set a flag so hwloc sets things up correctly
@@ -379,15 +381,17 @@ int opal_hwloc_base_get_topology(void)
         if (0
             != opal_hwloc_base_topology_set_flags(opal_hwloc_topology,
                                                   HWLOC_TOPOLOGY_FLAG_IS_THISSYSTEM, true)) {
+            /* default to discovery */
             hwloc_topology_destroy(opal_hwloc_topology);
             free(val);
-            return OPAL_ERROR;
+            goto discover;
         }
         /* now load the topology */
         if (0 != hwloc_topology_load(opal_hwloc_topology)) {
+            /* default to discovery */
             hwloc_topology_destroy(opal_hwloc_topology);
             free(val);
-            return OPAL_ERROR;
+            goto discover;
         }
         free(val);
         /* filter the cpus thru any default cpu set */
@@ -396,6 +400,7 @@ int opal_hwloc_base_get_topology(void)
             return rc;
         }
     } else {
+    discover:
         opal_output_verbose(1, opal_hwloc_base_framework.framework_output,
                             "hwloc:base discovering topology");
         if (0 != hwloc_topology_init(&opal_hwloc_topology)
-- 
GitLab