diff options
author | 2019-12-25 20:37:23 -0500 | |
---|---|---|
committer | 2019-12-25 20:37:23 -0500 | |
commit | 1029f9c624e3f3bf252f20197f357cca00a20410 (patch) | |
tree | 4ff67aa7775d2fd90f7dba6293d114e41bbc2f01 /bin | |
parent | bin/build_search_documents.py: create documents from tables (diff) | |
download | devmanual-1029f9c624e3f3bf252f20197f357cca00a20410.tar.gz devmanual-1029f9c624e3f3bf252f20197f357cca00a20410.tar.bz2 devmanual-1029f9c624e3f3bf252f20197f357cca00a20410.zip |
bin/build_search_documents.py: handle multi-line indented text better
Beyond replacing newlines that show up in the middle of a text, remove
the whitespace following the newline (which is the indentation) as
well.
Signed-off-by: Göktürk Yüksek <gokturk@gentoo.org>
Diffstat (limited to 'bin')
-rwxr-xr-x | bin/build_search_documents.py | 12 |
1 file changed, 9 insertions, 3 deletions
```diff
diff --git a/bin/build_search_documents.py b/bin/build_search_documents.py
index 1aac495..38ffd24 100755
--- a/bin/build_search_documents.py
+++ b/bin/build_search_documents.py
@@ -5,6 +5,12 @@ import json
 import os.path
 import sys
 import xml.etree.ElementTree as ET
+import re
+
+
+# The regex for stripping a newline and the possible indentation
+# whitespace following it in multiline content
+whitespace_re = re.compile(r'\n[ \t]*', flags=re.M)
 
 
 def stringify_node(parent: ET.Element) -> str:
@@ -28,7 +34,7 @@ def stringify_node(parent: ET.Element) -> str:
 
     # For each child, strip the tags and append to text
     # along with the tail text following it.
-    # The tail may include '\n' if it spans multiple lines.
+    # The tail may include '\n', '\t', ' ' if it spans multiple lines.
     # We will worry about those on return, not now.
     for child in parent:
         # The '<d/>' tag is simply a fancier '-' character
@@ -42,8 +48,8 @@ def stringify_node(parent: ET.Element) -> str:
     # A paragraph typically ends with:
     # Text\n</p>
     # Right strip any spurious whitespace.
-    # Finally, get rid of any intermediate newlines.
-    return text.rstrip().replace('\n', ' ')
+    # Finally, get rid of any intermediate newlines and indentation whitespace.
+    return whitespace_re.sub(' ', text.rstrip())
 
 
 def process_node(documents: list, node: ET.Element, name: str, url: str) -> None:
```