bin/build_search_documents.py: handle multi-line indented text better

In addition to replacing newlines that appear in the middle of text, also remove the whitespace that follows each newline (i.e. the indentation).

Signed-off-by: Göktürk Yüksek <gokturk@gentoo.org>
author: Göktürk Yüksek <gokturk@gentoo.org> 2019-12-25 20:37:23 -0500
committer: Göktürk Yüksek <gokturk@gentoo.org> 2019-12-25 20:37:23 -0500
commit: 1029f9c624e3f3bf252f20197f357cca00a20410 (patch)
tree: 4ff67aa7775d2fd90f7dba6293d114e41bbc2f01 /bin
parent: bin/build_search_documents.py: create documents from tables (diff)
download: devmanual-1029f9c624e3f3bf252f20197f357cca00a20410.tar.gz
devmanual-1029f9c624e3f3bf252f20197f357cca00a20410.tar.bz2
devmanual-1029f9c624e3f3bf252f20197f357cca00a20410.zip
1 files changed, 9 insertions, 3 deletions
diff --git a/bin/build_search_documents.py b/bin/build_search_documents.py
index 1aac495..38ffd24 100755
--- a/bin/build_search_documents.py
+++ b/bin/build_search_documents.py
@@ -5,6 +5,12 @@ import json
 import os.path
 import sys
 import xml.etree.ElementTree as ET
+import re
+
+
+# The regex for stripping a newline and the possible indentation
+# whitespace following it in multiline content
+whitespace_re = re.compile(r'\n[ \t]*', flags=re.M)
 
 
 def stringify_node(parent: ET.Element) -> str:
@@ -28,7 +34,7 @@ def stringify_node(parent: ET.Element) -> str:
 
     # For each child, strip the tags and append to text
     # along with the tail text following it.
-    # The tail may include '\n' if it spans multiple lines.
+    # The tail may include '\n', '\t', ' ' if it spans multiple lines.
     # We will worry about those on return, not now.
     for child in parent:
         # The '<d/>' tag is simply a fancier '-' character
@@ -42,8 +48,8 @@ def stringify_node(parent: ET.Element) -> str:
     # A paragraph typically ends with:
     #   Text\n</p>
     # Right strip any spurious whitespace.
-    # Finally, get rid of any intermediate newlines.
-    return text.rstrip().replace('\n', ' ')
+    # Finally, get rid of any intermediate newlines and indentation whitespace.
+    return whitespace_re.sub(' ', text.rstrip())
 
 
 def process_node(documents: list, node: ET.Element, name: str, url: str) -> None:
author	Göktürk Yüksek <gokturk@gentoo.org>	2019-12-25 20:37:23 -0500
committer	Göktürk Yüksek <gokturk@gentoo.org>	2019-12-25 20:37:23 -0500
commit	1029f9c624e3f3bf252f20197f357cca00a20410 (patch)
tree	4ff67aa7775d2fd90f7dba6293d114e41bbc2f01 /bin
parent	bin/build_search_documents.py: create documents from tables (diff)
download	devmanual-1029f9c624e3f3bf252f20197f357cca00a20410.tar.gz devmanual-1029f9c624e3f3bf252f20197f357cca00a20410.tar.bz2 devmanual-1029f9c624e3f3bf252f20197f357cca00a20410.zip