diff --git a/otc_doc_convertor/convertor.py b/otc_doc_convertor/convertor.py index 3bfbe3a37..c16ca1c79 100644 --- a/otc_doc_convertor/convertor.py +++ b/otc_doc_convertor/convertor.py @@ -221,7 +221,7 @@ class OTCDocConvertor: else: logging.debug( "Not placing replaced anchor %s " - " since it already existed", + "since it already existed", local_ref, ) @@ -319,17 +319,6 @@ class OTCDocConvertor: new = f"{em.string}" em.replace_with(bs4.BeautifulSoup(new, "html.parser")) - for p in soup.body.find_all(string=re.compile(r"(/\*).+")): - if p.string and p.parent.name == "p": - p.string.replace_with(p.string.replace("/*", "/``*``")) - - # MRS UMN contains: /opt/Bigdata/FusionInsight_Porter_8.*/foo-*/ - # This is a pretty special case and we do not want to apply that widely - # therefore only looking for [.-]*/ combinations - for p in soup.body.find_all(string=re.compile(r"([\.-]\*/).+")): - if p.string and p.parent.name == "p": - p.string.replace_with(p.string.replace("*/", "``*``/")) - escape_asterisk_re = r"\((\*)[\.,]" for p in soup.body.find_all(string=re.compile(escape_asterisk_re)): if p.string and p.parent.name not in ["b", "strong", "pre"]: @@ -356,6 +345,8 @@ class OTCDocConvertor: r"\(([\W\x60_]{10,})\)", # MRS UMN contain: /:*?"<>|\\;&,'`!{}[]$%+ r"\s([^a-zA-Z0-9\s]{8,})", + # MRS operation guide contain: /*+ MAPJOIN(join_table) \*/ + r"\s(/\*.*\*/)", # BMS API contain sequence in a dedicated paragraph r"^([^a-zA-Z0-9\s]{10,})$", # OBS special chars - "\$" "\\" etc @@ -364,6 +355,8 @@ class OTCDocConvertor: r"\s(urn:smn:\(.*)\.", # "-" only (in tables) is considered as list r"^(-)$", + # MRS component guide has: "./mydate_\\\\d*/" + r"\w(_)\\", ] for to_rawize in rawize_strings: for p in soup.body.find_all(string=re.compile(to_rawize)): @@ -386,6 +379,20 @@ class OTCDocConvertor: "Cannot find string for rawization anymore" ) + # Pandoc seems not to properly escape asterisks which are + # immediately following non-word chars + # 
(https://docutils.sourceforge.io/docs/ref/rst/restructuredtext.html#toc-entry-44) + # NOTE(gtema): + # 1. this is on purpose placed here since we want to have some special + # escapings above + # 2. we are not escaping asterisks at the end of the paragraphs (pandoc + # deals correctly with that) + re_escape = re.compile(r"([-:/'\"<\([{])(\*+)(.+)") + for p in soup.body.find_all(string=re_escape): + if p.string and p.parent.name == "p": + p.string.replace_with( + re.sub(re_escape, r"\1``\2``\3", p.string)) + # Drop parent link at the bottom of the page for parent in soup.body.find_all("p", class_="familylinks"): parent.decompose() @@ -555,8 +562,20 @@ class OTCDocConvertor: writer.write(f".. _{f.name.replace('.html', '')}:\n\n") # post process some usual stuff for line in reader.readlines(): - processed_line = re.sub(r"\.\.\\\\_", ".. _", line) - processed_line = re.sub(r"\.\.\\_", ".. _", processed_line) + processed_line = re.sub( + r"\.\.\\\\_(.*):$", r".. _\1:", line) + # replace anchor when it is itself inside some other block + # (i.e. table) + processed_line = re.sub( + r"\.\.\\\\_(.*):\s", r".. _\1: ", processed_line) + # For some reason regex locally and in zuul are not + # behaving the same - thus same but different + processed_line = re.sub( + r"\.\.\\_(.*):$", r".. _\1:", processed_line) + # replace anchor when it is itself inside some other block + # (i.e. table) + processed_line = re.sub( + r"\.\.\\_(.*):\s", r".. _\1: ", processed_line) # We could get unwanted anchors from pandoc - get rid of # them anchor = re.search(r"\.\. \_(.*):", processed_line) @@ -566,7 +585,9 @@ class OTCDocConvertor: ): # This is most likely some duplicated anchor. It is # not referred from any other place so drop it - logging.info("Dropping not referred anchor") + logging.info( + "Dropping not referred anchor '%s'", + anchor.group(1)) continue processed_line = re.sub(