From 901e00be52fa02ab4a39fb8bbdfcec1b78318651 Mon Sep 17 00:00:00 2001
From: gtema <artem.goncharov@gmail.com>
Date: Tue, 27 Sep 2022 12:21:11 +0000
Subject: [PATCH] Another series of conversion fixes

Reviewed-by: Hasko, Vladimir <vladimir.hasko@t-systems.com>
Co-authored-by: gtema <artem.goncharov@gmail.com>
Co-committed-by: gtema <artem.goncharov@gmail.com>
---
 otc_doc_convertor/convertor.py | 458 +++++++++++++++++++--------------
 otc_doc_convertor/process.py   | 409 -----------------------------
 2 files changed, 259 insertions(+), 608 deletions(-)
 delete mode 100644 otc_doc_convertor/process.py

diff --git a/otc_doc_convertor/convertor.py b/otc_doc_convertor/convertor.py
index a62e22d5c..5abd71902 100644
--- a/otc_doc_convertor/convertor.py
+++ b/otc_doc_convertor/convertor.py
@@ -13,24 +13,23 @@ from jinja2 import FileSystemLoader, Environment, select_autoescape
 
 
 class OTCDocConvertor:
-
     def __init__(self):
         self.doc_anchors = dict()
         self.doc_links = dict()
 
     @staticmethod
     def get_new_name(current_name):
-        new_name = current_name.replace(' - ', '_')
-        new_name = new_name.replace(' ', '_')
-        new_name = new_name.replace('/', '_')
-        new_name = new_name.replace('\'', '')
-        new_name = new_name.replace('"', '')
-        new_name = new_name.replace('`', '')
-        new_name = new_name.replace('´', '')
-        new_name = new_name.replace(':', '')
-        new_name = new_name.replace('?', '')
-        new_name = new_name.replace('(', '')
-        new_name = new_name.replace(')', '')
+        new_name = current_name.replace(" - ", "_")
+        new_name = new_name.replace(" ", "_")
+        new_name = new_name.replace("/", "_")
+        new_name = new_name.replace("'", "")
+        new_name = new_name.replace('"', "")
+        new_name = new_name.replace("`", "")
+        new_name = new_name.replace("´", "")
+        new_name = new_name.replace(":", "")
+        new_name = new_name.replace("?", "")
+        new_name = new_name.replace("(", "")
+        new_name = new_name.replace(")", "")
         new_name = new_name.lower()
         return new_name
 
@@ -38,7 +37,7 @@ class OTCDocConvertor:
     def build_doc_tree(metadata):
         flat_tree = dict()
         for k, v in metadata.items():
-            parent_id = v.get('p_code')
+            parent_id = v.get("p_code")
             if not parent_id:
                 parent_id = 0
 
@@ -48,19 +47,18 @@ class OTCDocConvertor:
         return flat_tree
 
     @classmethod
-    def get_target_path(cls, code, metadata, path=''):
+    def get_target_path(cls, code, metadata):
         if code in metadata:
             current = metadata[code]
-            if not current.get('p_code'):
-                return current['new_name']
+            if not current.get("p_code"):
+                return current["new_name"]
             else:
-                return (
-                    "{0}/{1}".format(
-                        cls.get_target_path(current['p_code'], metadata),
-                        current['new_name'])
+                return "{0}/{1}".format(
+                    cls.get_target_path(current["p_code"], metadata),
+                    current["new_name"],
                 )
         else:
-            return ''
+            return ""
 
     def make_label(self, soup, name):
         label = soup.new_tag("p")
@@ -70,8 +68,8 @@ class OTCDocConvertor:
     def is_element_referred(self, ref, fname):
         return (
             ref in self.doc_links
-            or '#' + ref in self.doc_links
-            or fname + '#' + ref in self.doc_links
+            or "#" + ref in self.doc_links
+            or fname + "#" + ref in self.doc_links
         )
 
     def streamline_html(self, soup, file_name):
@@ -92,105 +90,127 @@ class OTCDocConvertor:
                 lnk.decompose()
 
         # Process divs
-        for i in soup.body.find_all('div'):
-            if "note" in i.get('class', []):
+        for i in soup.body.find_all("div"):
+            if "note" in i.get("class", []):
                 # Notes
-                del i['id']
+                del i["id"]
                 if i.img:
                     i.img.decompose()
-                notetitle = i.find('span', class_='notetitle')
+                notetitle = i.find("span", class_="notetitle")
                 if notetitle:
-                    title = soup.new_tag('div')
-                    title['class'] = 'title'
-                    title.string = 'Note:'
+                    title = soup.new_tag("div")
+                    title["class"] = "title"
+                    title.string = "Note:"
                     notetitle.replace_with(title)
-            elif "warning" in i.get('class', []):
+            elif "warning" in i.get("class", []):
                 # Warnings
-                del i['id']
+                del i["id"]
                 if i.img:
                     i.img.decompose()
-                eltitle = i.find('span', class_='warningtitle')
+                eltitle = i.find("span", class_="warningtitle")
                 if eltitle:
-                    title = soup.new_tag('div')
-                    title['class'] = 'title'
-                    title.string = 'Warning:'
+                    title = soup.new_tag("div")
+                    title["class"] = "title"
+                    title.string = "Warning:"
                     eltitle.replace_with(title)
-            elif "notice" in i.get('class', []):
+            elif "notice" in i.get("class", []):
                 # Notices
-                del i['id']
+                del i["id"]
                 if i.img:
                     i.img.decompose()
-                i['class'] = 'important'
-            elif "caution" in i.get('class', []):
+                i["class"] = "important"
+            elif "caution" in i.get("class", []):
                 # Cautions
-                del i['id']
+                del i["id"]
                 if i.img:
                     i.img.decompose()
-            elif "fignone" in i.get('class', []):
+            elif "fignone" in i.get("class", []):
                 # Figures
                 # When we found figure generate local label (anchor)
-                if i.get('id'):
-                    logging.debug('place figure label')
+                if i.get("id"):
+                    logging.debug("place figure label")
                     i.insert_before(self.make_label(soup, i.get("id")))
-                figure = soup.new_tag('figure')
-                img = i.find('img')
-                cap = i.find('span', class_='figcap')
+                figure = soup.new_tag("figure")
+                img = i.find("img")
+                cap = i.find("span", class_="figcap")
                 if cap is not None:
-                    cap.name = 'figcaption'
+                    cap.name = "figcaption"
                     figure.append(cap)
                 if img:
                     # Store all referred images for copying
-                    self.doc_images.add(img['src'])
-                    img['src'] = '/_static/images/' + img['src']
+                    self.doc_images.add(img["src"])
+                    img["src"] = "/_static/images/" + img["src"]
                     del img["width"]
                     del img["height"]
                     del img["class"]
                     del img["title"]
                     figure.append(img)
                 i.replace_with(figure)
-            elif "section" in i.get('class', []):
+            elif "section" in i.get("class", []):
                 # Sections
-                # When we found section generate local label (anchor)
-                if i.get('id'):
+                if i.get("id"):
+                    # When we found section generate local label (anchor)
                     sec_id = i.get("id").lower()
                     if self.is_element_referred(sec_id, file_name):
-                        logging.debug('Add section label')
                         page_anchors.add(sec_id)
                         i.insert_before(self.make_label(soup, sec_id))
-                # and still convert to paragraph
-                i.name = 'p'
+                i.unwrap()
+            elif i.get("id") and i.get("id").startswith("body"):
+                i.unwrap()
             else:
-                i.name = 'p'
+                i.name = "p"
 
         # Process remaining images
-        for img in soup.body.find_all('img'):
-            if img['src'] and not img['src'].startswith('/_static/images'):
-                self.doc_images.add(img['src'])
-                img['src'] = '/_static/images/' + img['src']
+        for img in soup.body.find_all("img"):
+            if img["src"] and not img["src"].startswith("/_static/images"):
+                self.doc_images.add(img["src"])
+                img["src"] = "/_static/images/" + img["src"]
                 del img["width"]
                 del img["height"]
                 del img["class"]
                 del img["title"]
+                del img["id"]
 
         # Drop strong in table headers "/"
-        for th in soup.body.find_all('th'):
+        for th in soup.body.find_all("th"):
             if th.p.strong:
                 th.p.strong.unwrap()
 
         if self.args.improve_table_headers:
             # Add spaces around "/"
-            for th in soup.body.find_all('th'):
-                if hasattr(th, 'p') and th.p.string:
-                    th.p.string = re.sub(
-                        r'\b/\b',
-                        ' / ',
-                        th.p.string)
+            for th in soup.body.find_all("th"):
+                if hasattr(th, "p") and th.p.string:
+                    th.p.string = re.sub(r"\b/\b", " / ", th.p.string)
 
         # Drop strong around links "/"
-        for strong in soup.body.find_all('strong'):
+        for strong in soup.body.find_all("strong"):
             if strong.a:
                 strong.unwrap()
 
+        # Table anchors - some tables are referred to. Some have an anchor in
+        # front, some do not. To cope with that we analyze every table and,
+        # if it is referred to, prepend an anchor. Later anchor processing
+        # will skip it, since such an anchor is already placed on the page
+        for table in soup.body.find_all("table"):
+            # Verify this is really called from somewhere:
+            if table.get("id"):
+                local_ref = table["id"].lower()
+                if self.is_element_referred(local_ref, file_name):
+                    # We now know something in the document wants this anchor -
+                    # replace it with label
+                    if local_ref not in page_anchors:
+                        lnk = bs4.BeautifulSoup(
+                            f"<p>..\\_{local_ref}:</p>", "html.parser"
+                        )
+                        table.insert_before(lnk)
+                        page_anchors.add(local_ref)
+                    else:
+                        logging.debug(
+                            "Not placing replaced anchor %s "
+                            " since it already existed",
+                            local_ref,
+                        )
+
         # local anchors
         for lnk in soup.body.find_all("a"):
             if (
@@ -208,19 +228,22 @@ class OTCDocConvertor:
                     # We now know something in the document wants this anchor -
                     # replace it with label
                     if local_ref not in page_anchors:
+                        logging.debug("Adding anchor")
                         lnk.name = "p"
                         lnk.string = f"..\\_{local_ref}:"
                         del lnk["name"]
                         page_anchors.add(local_ref)
                     else:
                         logging.debug(
-                            f"Not placing replaced anchor {local_ref} "
-                            f" since it already existed")
+                            "Not placing replaced anchor %s "
+                            " since it already existed",
+                            local_ref,
+                        )
                 else:
-                    logging.debug("Dropping unreferred link")
+                    logging.debug("Dropping unreferred link %s", lnk)
 
         for li in soup.body.find_all("li"):
-            del li['id']
+            del li["id"]
 
         # Sometimes we have code blocks with line numbers.
         # <div class="codecoloring" codetype="xxx"><table class="xxx">
@@ -239,22 +262,31 @@ class OTCDocConvertor:
             text = pre.get_text()
             # if text.startswith("{"):
             #    pre["class"] = "data"
-            if re.search(
-                r'\[[a-z]*@\w+.*\][\s#>]?',
-                text
-            ):
+            if re.search(r"\[[a-z]*@\w+.*\][\s#>]?", text):
                 # Something like "[root@ecs-test-0001 ~]#"
                 pre["class"] = "console"
-            elif re.match(
-                r'^(GET|PUT|POST|DELETE)',
-                text
-            ):
+            elif re.match(r"^(GET|PUT|POST|DELETE)", text):
                 # Something like "DELETE https://some_url"
                 pre["class"] = "text"
             if "codeblock" in pre.get("class", []):
                 # <pre class="codeblock"
                 pre["class"] = "text"
 
+        escape_asterisk_re = r"\((\*)[\.,]"
+        for p in soup.body.find_all(string=re.compile(escape_asterisk_re)):
+            if p.string:
+                curr = p.string
+                part = re.search(escape_asterisk_re, curr)
+                # If we have `<b> all files (*.*)</b>` - no need to escape
+                if len(part.groups()) > 0 and p.parent.name != "b":
+                    logging.debug(
+                        "Found asterisks to escape: %s", part.group(1)
+                    )
+                    new = curr.replace(
+                        part.group(1), f"<code>{part.group(1)}</code>"
+                    )
+                    p.replace_with(bs4.BeautifulSoup(new, "html.parser"))
+
         # And now specialities
         rawize_strings = [
             # "\*\*\*\*\*\*",
@@ -265,6 +297,8 @@ class OTCDocConvertor:
             r"(\*name)",
             # DMS UMN contain: (`~!@#$%^&*()-_=+\|[{}]:'",<.>/?)
             r"\(([\W\x60_]{10,})\)",
+            # MRS UMN contain: /:*?"<>|\\;&,'`!{}[]$%+
+            r"\s([^a-zA-Z0-9\s]{10,})",
         ]
         for to_rawize in rawize_strings:
             for p in soup.body.find_all(string=re.compile(to_rawize)):
@@ -272,102 +306,104 @@ class OTCDocConvertor:
                     curr = p.string
                     part = re.search(to_rawize, curr)
                     if len(part.groups()) > 0:
-                        new = curr.replace(
-                            part.group(1),
-                            f"<code>{part.group(1)}</code>"
+                        logging.debug(
+                            "Found element to rawize: %s", part.group(1)
                         )
-                        p.replace_with(bs4.BeautifulSoup(new, 'html.parser'))
-                        print(part.group(1))
-                        print(f"New content is {p.string}")
+                        new = curr.replace(
+                            part.group(1), f"<code>{part.group(1)}</code>"
+                        )
+                        logging.debug("Replacing string with: %s", new)
+                        p.replace_with(bs4.BeautifulSoup(new, "html.parser"))
+                        logging.debug("Replacing string with: %s", p.string)
                     else:
-                        print('ups')
+                        logging.error(
+                            "Cannot find string for rawization anymore"
+                        )
                 logging.error(f"String with star: {p}")
 
+        # Drop parent link at the bottom of the page
+        for parent in soup.body.find_all("p", class_="familylinks"):
+            parent.decompose()
+
         return soup.body
 
     def main(self):
         logging.basicConfig(level=logging.DEBUG)
-        parser = argparse.ArgumentParser(description='Process links.')
+        parser = argparse.ArgumentParser(description="Process links.")
+        parser.add_argument("path", type=str, help="path to the files")
         parser.add_argument(
-            'path', type=str, help='path to the files')
+            "--improve-table-headers",
+            action="store_true",
+            help="Improve table headers by enforcing spaces around `/`",
+        )
         parser.add_argument(
-            '--improve-table-headers', action='store_true',
-            help='Improve table headers by enforcing spaces around `/`')
+            "--pygments-lexer", help="Set particular code-block lexer language"
+        )
         parser.add_argument(
-            '--pygments-lexer',
-            help='Set particular code-block lexer language')
+            "--dest", help="Directory to write resulting files"
+        )
+        parser.add_argument("--title", required=True, help="Document title")
         parser.add_argument(
-            '--dest',
-            help='Directory to write resulting files')
+            "--service", help="Service to which the document belongs to"
+        )
+        parser.add_argument("--repo-name", help="Service repository")
+        parser.add_argument("--pdf-name", help="PDF File name")
         parser.add_argument(
-            '--title',
-            required=True,
-            help='Document title')
-        parser.add_argument(
-            '--service',
-            help='Service to which the document belongs to')
-        parser.add_argument(
-            '--repo-name',
-            help='Service repository')
-        parser.add_argument(
-            '--pdf-name',
-            help='PDF File name')
-        parser.add_argument(
-            '--templates-location',
-            default='templates',
-            help='Location of additional templates')
+            "--templates-location",
+            default="templates",
+            help="Location of additional templates",
+        )
         self.args = parser.parse_args()
         if self.args.dest:
             dest = pathlib.Path(self.args.dest)
         else:
-            dest = pathlib.Path(self.args.path, 'result')
+            dest = pathlib.Path(self.args.path, "result")
         dest.mkdir(parents=True, exist_ok=True)
 
-        metadata_file = pathlib.Path(
-            self.args.path, "CLASS.TXT.json")
+        metadata_file = pathlib.Path(self.args.path, "CLASS.TXT.json")
         meta_data = dict()
 
         if not metadata_file.exists():
             logging.warning(
                 f"CLASS.TXT.json file is missing in {self.args.path}, "
-                f"assuming initial import")
+                f"assuming initial import"
+            )
             with open(pathlib.Path(dest, "index.rst"), "w") as index:
-                index.write('=' * (len(self.args.title)) + '\n')
-                index.write(self.args.title + '\n')
-                index.write('=' * (len(self.args.title)) + '\n')
-                index.write('\n')
+                index.write("=" * (len(self.args.title)) + "\n")
+                index.write(self.args.title + "\n")
+                index.write("=" * (len(self.args.title)) + "\n")
+                index.write("\n")
         else:
             meta_data = json.loads(open(metadata_file).read())
         metadata_by_uri = dict()
         metadata_by_code = dict()
         self.doc_images = set()
         for f in meta_data:
-            f['new_name'] = self.get_new_name(f['title'])
-            metadata_by_uri[f['uri']] = f
-            metadata_by_code[f.get('code')] = f
+            f["new_name"] = self.get_new_name(f["title"])
+            metadata_by_uri[f["uri"]] = f
+            metadata_by_code[f.get("code")] = f
 
         tree = self.build_doc_tree(metadata_by_code)
 
         pathlib.Path(self.args.path, "temp/").mkdir(
-            parents=True, exist_ok=True)
+            parents=True, exist_ok=True
+        )
 
         # Scan all docs for anchors
         for f in pathlib.Path(self.args.path).glob("*.html"):
             if f.name not in metadata_by_uri:
                 continue
             # Registering section links
-            with open(f, 'r') as reader:
+            with open(f, "r") as reader:
                 logging.debug(f"Scanning {f.name}")
                 content = reader.read()
                 soup = bs4.BeautifulSoup(content, "lxml")
-                for lnk in soup.body.find_all('a'):
+                for lnk in soup.body.find_all("a"):
                     if "name" in lnk.attrs and lnk.string is None:
                         anchor = lnk.attrs["name"]
-                        title = re.sub('[ _:]', '-', anchor)
+                        title = re.sub("[ _:]", "-", anchor)
                         res = dict(
-                            fname=f.name,
-                            title=title,
-                            replace=title.lower()
+                            fname=f.name, title=title, replace=title.lower()
                         )
                         self.doc_anchors[anchor] = res
                     if "href" in lnk.attrs and lnk["href"]:
@@ -377,20 +413,22 @@ class OTCDocConvertor:
             if f.name not in metadata_by_uri:
                 continue
             _target = metadata_by_uri[f.name]
-            target = _target['new_name']
+            target = _target["new_name"]
             target_path = self.get_target_path(
-                _target['p_code'], metadata_by_code)
+                _target["p_code"], metadata_by_code
+            )
             pathlib.Path(self.args.path, "temp").mkdir(
-                parents=True, exist_ok=True)
+                parents=True, exist_ok=True
+            )
             pathlib.Path(self.args.path, "tmp_result/" + target_path).mkdir(
-                parents=True, exist_ok=True)
-            pathlib.Path(dest, target_path).mkdir(
-                parents=True, exist_ok=True)
+                parents=True, exist_ok=True
+            )
+            pathlib.Path(dest, target_path).mkdir(parents=True, exist_ok=True)
 
             # Pre-processing of html content
-            with open(f, 'r') as reader, \
-                 open(pathlib.Path(self.args.path,
-                      f"temp/{target}.tmp"), 'w') as writer:
+            with open(f, "r") as reader, open(
+                pathlib.Path(self.args.path, f"temp/{target}.tmp"), "w"
+            ) as writer:
                 # if f.name not in [
                 # ]:
                 #     continue
@@ -400,31 +438,28 @@ class OTCDocConvertor:
                 proc = self.streamline_html(soup, f.name)
 
                 for lnk in proc.find_all("a"):
-                    href = lnk.get('href')
-                    if href and not href.startswith('http'):
+                    href = lnk.get("href")
+                    if href and not href.startswith("http"):
                         # Internal link - replace with :ref:
-                        code = soup.new_tag('code')
-                        code['class'] = "interpreted-text"
-                        code['role'] = "ref"
-                        href_parts = href.split('#')
+                        code = soup.new_tag("code")
+                        code["class"] = "interpreted-text"
+                        code["role"] = "ref"
+                        href_parts = href.split("#")
                         if len(href_parts) > 1:
                             # for anchor just use anchor ref
                             link_target = href_parts[1].lower()
                         else:
                             # for other page - use only page name
-                            link_target = href_parts[0].replace(
-                                ".html", "").lower()
+                            link_target = (
+                                href_parts[0].replace(".html", "").lower()
+                            )
                         if link_target:
                             # Looks like an anchor on the same page
                             code.string = f"{lnk.string} <{link_target}>"
                             logging.debug(f" replace {lnk} with {code}")
                             lnk.replace_with(code)
 
-                # Drop parent link at the bottom of the page
-                for parent in proc.find_all("p", class_="parentlink"):
-                    parent.decompose()
-
-                logging.info(f'Saving file {writer.name}')
+                logging.info(f"Saving file {writer.name}")
                 writer.write(str(proc))
 
             # Convert html to rst
@@ -434,58 +469,84 @@ class OTCDocConvertor:
                 f"--ascii -s --wrap none"
             )
             # Post processing of rendered rst
-            with open(f"{self.args.path}/tmp_result/"
-                      f"{target_path}/{target}.rst", 'r') \
-                 as reader, \
-                 open(pathlib.Path(dest, target_path,
-                      f"{target}.rst"), 'w') as writer:
+            with open(
+                f"{self.args.path}/tmp_result/" f"{target_path}/{target}.rst",
+                "r",
+            ) as reader, open(
+                pathlib.Path(dest, target_path, f"{target}.rst"), "w"
+            ) as writer:
                 logging.info(f"Post processing {target}...")
                 writer.write(f":original_name: {f.name}\n\n")
                 # Add root file label
                 writer.write(f".. _{f.name.replace('.html', '')}:\n\n")
                 # post process some usual stuff
                 for line in reader.readlines():
-                    processed_line = re.sub(r'\.\.\\_', '.. _', line)
-                    processed_line = re.sub(r'√', 'Y', processed_line)
+                    processed_line = re.sub(r"\.\.\\\\_", ".. _", line)
+                    processed_line = re.sub(r"\.\.\\_", ".. _", processed_line)
+                    # We could get unwanted anchors from pandoc - get rid of
+                    # them
+                    anchor = re.search(r"\.\. \_(.*):", processed_line)
+                    if anchor and len(anchor.groups()) > 0:
+                        if not self.is_element_referred(
+                            anchor.group(1), f.name
+                        ):
+                            # This is most likely some duplicated anchor. It is
+                            # not referred from any other place so drop it
+                            logging.info("Dropping not referred anchor")
+                            continue
+
+                    processed_line = re.sub(r"√", "Y", processed_line)
                     processed_line = re.sub(
-                        r'public_sys-resources/', '', processed_line)
+                        r"public_sys-resources/", "", processed_line
+                    )
                     processed_line = re.sub(
-                        r'   :name: .*$', '', processed_line)
+                        r"   :name: .*$", "", processed_line
+                    )
                     processed_line = re.sub(
-                        r'\*\*Parent topic:.*$', '', processed_line)
+                        r"\*\*Parent topic:.*$", "", processed_line
+                    )
                     processed_line = re.sub(
-                        r'.. code:: screen$',
-                        r'.. code-block::', processed_line)
+                        r".. code:: screen$",
+                        r".. code-block::",
+                        processed_line,
+                    )
                     for lexer in ["json", "bash", "text", "console"]:
                         processed_line = re.sub(
                             f".. code:: {lexer}$",
-                            f".. code-block:: {lexer}", processed_line)
+                            f".. code-block:: {lexer}",
+                            processed_line,
+                        )
                         if re.match(rf".. code:: {lexer}\s", processed_line):
                             logging.error(
                                 f"'code-block: {lexer}' with something "
-                                "afterwards")
+                                "afterwards"
+                            )
                             exit(1)
                     # spaces are important, since code-block may reside inside
                     # of the cell
                     processed_line = re.sub(
-                        r'.. code:: screen\s',
-                        r'.. code-block::  ', processed_line)
+                        r".. code:: screen\s",
+                        r".. code-block::  ",
+                        processed_line,
+                    )
                     processed_line = re.sub(
-                        r'.. code:: codeblock$',
-                        r'.. code-block::', processed_line)
-                    processed_line = re.sub(r'[ \t]*$', '', processed_line)
+                        r".. code:: codeblock$",
+                        r".. code-block::",
+                        processed_line,
+                    )
+                    processed_line = re.sub(r"[ \t]*$", "", processed_line)
                     writer.write(processed_line)
 
         # Generate indexes
         for k, v in tree.items():
-            path = ''
+            path = ""
             title = self.args.title
-            page_label = ''
+            page_label = ""
             if k != 0:
                 curr = metadata_by_code[k]
-                title = curr['title']
-                page_label = curr['uri'].replace(".html", "").lower()
-                path = self.get_target_path(curr['code'], metadata_by_code)
+                title = curr["title"]
+                page_label = curr["uri"].replace(".html", "").lower()
+                path = self.get_target_path(curr["code"], metadata_by_code)
 
             p = pathlib.Path(dest, f"{path}.rst")
             if p.exists():
@@ -495,50 +556,51 @@ class OTCDocConvertor:
                 # Update existing index file
                 p.rename(pathlib.Path(dest, f"{path}/index.rst"))
                 with open(pathlib.Path(dest, path, "index.rst"), "a") as index:
-                    index.write('\n')
-                    index.write('.. toctree::\n')
-                    index.write('   :maxdepth: 1\n')
-                    index.write('   :hidden: \n\n')
+                    index.write("\n")
+                    index.write(".. toctree::\n")
+                    index.write("   :maxdepth: 1\n")
+                    index.write("   :hidden: \n\n")
                     for child in v:
-                        new_name = child['new_name']
-                        if child['code'] in tree:
+                        new_name = child["new_name"]
+                        if child["code"] in tree:
                             # If this is folder - add /index
-                            new_name = new_name + '/index'
+                            new_name = new_name + "/index"
                         index.write(f"   {new_name}\n")
             else:
                 with open(pathlib.Path(dest, path, "index.rst"), "w") as index:
                     # New index file
                     if page_label:
                         index.write(f".. _{page_label}:\n\n")
-                    index.write('=' * (len(title)) + '\n')
-                    index.write(title + '\n')
-                    index.write('=' * (len(title)) + '\n')
-                    index.write('\n')
-                    index.write('.. toctree::\n')
-                    index.write('   :maxdepth: 1\n\n')
+                    index.write("=" * (len(title)) + "\n")
+                    index.write(title + "\n")
+                    index.write("=" * (len(title)) + "\n")
+                    index.write("\n")
+                    index.write(".. toctree::\n")
+                    index.write("   :maxdepth: 1\n\n")
                     for child in v:
-                        new_name = child['new_name']
-                        if child['code'] in tree:
+                        new_name = child["new_name"]
+                        if child["code"] in tree:
                             # If this is folder - add /index
-                            new_name = new_name + '/index'
+                            new_name = new_name + "/index"
                         index.write(f"   {new_name}\n")
         # Copy used images
         if len(self.doc_images) > 0:
             logging.debug("Processing images...")
-            img_dest = pathlib.Path(dest, '_static', 'images')
+            img_dest = pathlib.Path(dest, "_static", "images")
             img_dest.mkdir(parents=True, exist_ok=True)
             for img in self.doc_images:
                 shutil.copyfile(
                     pathlib.Path(self.args.path, img).resolve(strict=False),
-                    pathlib.Path(
-                        img_dest, os.path.basename(img)).resolve(strict=False)
+                    pathlib.Path(img_dest, os.path.basename(img)).resolve(
+                        strict=False
+                    ),
                 )
 
         context = dict(
             title=self.args.title,
             project=self.args.service,
             repo_name=self.args.repo_name,
-            pdf_name=self.args.pdf_name
+            pdf_name=self.args.pdf_name,
         )
         loader = FileSystemLoader([self.args.templates_location])
         env = Environment(loader=loader, autoescape=select_autoescape())
@@ -547,9 +609,7 @@ class OTCDocConvertor:
             outfile_rendered = outfile_tmpl.render(**context)
             target_file = pathlib.Path(self.args.dest, f)
             target_file.parent.mkdir(parents=True, exist_ok=True)
-            with open(
-                target_file, 'w', encoding='utf-8', newline=''
-            ) as out:
+            with open(target_file, "w", encoding="utf-8", newline="") as out:
                 logging.debug(f"Generating {f} from template...")
                 out.write(outfile_rendered)
 
diff --git a/otc_doc_convertor/process.py b/otc_doc_convertor/process.py
deleted file mode 100644
index d2e5e76d7..000000000
--- a/otc_doc_convertor/process.py
+++ /dev/null
@@ -1,409 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import bs4
-import json
-import logging
-import os
-import pathlib
-import re
-
-
-class OTCDocConvertor:
-
-    def __init__(self):
-        self.doc_anchors = dict()
-        self.doc_links = dict()
-
-    @staticmethod
-    def get_new_name(current_name):
-        new_name = current_name.replace(' - ', '_')
-        new_name = new_name.replace(' ', '_')
-        new_name = new_name.replace('/', '_')
-        new_name = new_name.replace('\'', '')
-        new_name = new_name.replace('"', '')
-        new_name = new_name.replace('`', '')
-        new_name = new_name.replace('´', '')
-        new_name = new_name.replace(':', '')
-        new_name = new_name.replace('?', '')
-        new_name = new_name.replace('(', '')
-        new_name = new_name.replace(')', '')
-        new_name = new_name.lower()
-        return new_name
-
-    @staticmethod
-    def build_doc_tree(metadata):
-        flat_tree = dict()
-        for k, v in metadata.items():
-            parent_id = v.get('p_code')
-            if not parent_id:
-                parent_id = 0
-
-            if parent_id not in flat_tree:
-                flat_tree[parent_id] = list()
-            flat_tree[parent_id].append(v)
-        return flat_tree
-
-    @classmethod
-    def get_target_path(cls, code, metadata, path=''):
-        if code in metadata:
-            current = metadata[code]
-            if not current.get('p_code'):
-                return current['new_name']
-            else:
-                return (
-                    "{0}/{1}".format(
-                        cls.get_target_path(current['p_code'], metadata),
-                        current['new_name'])
-                )
-        else:
-            return ''
-
-    def make_label(self, soup, name):
-        label = soup.new_tag("p")
-        label.string = f"..\\_{name.lower()}:"
-        return label
-
-    def is_element_referred(self, ref, fname):
-        return (
-            ref in self.doc_links
-            or '#' + ref in self.doc_links
-            or fname + '#' + ref in self.doc_links
-        )
-
-    def streamline_html(self, soup, file_name):
-        # Drop eventual header duplicated anchors
-        fname = file_name.replace(".html", "").lower()
-        met_page_anchors = dict()
-        for lnk in soup.body.find_all("a"):
-            name = None
-            if "name" in lnk.attrs and lnk.string is None:
-                name = lnk.attrs["name"].lower()
-                if name in met_page_anchors:
-                    # Such anchor already existed on this page, drop it
-                    lnk.decompose()
-                met_page_anchors[name] = True
-
-            if name and name.lower() == fname:
-                lnk.decompose()
-
-        # Process divs
-        for i in soup.body.find_all('div'):
-            if "note" in i.get('class', []):
-                # Notes
-                del i['id']
-                if i.img:
-                    i.img.decompose()
-                notetitle = i.find('span', class_='notetitle')
-                if notetitle:
-                    title = soup.new_tag('div')
-                    title['class'] = 'title'
-                    title.string = 'Note:'
-                    notetitle.replace_with(title)
-            elif "notice" in i.get('class', []):
-                # Notices
-                del i['id']
-                if i.img:
-                    i.img.decompose()
-                i['class'] = 'important'
-            elif "caution" in i.get('class', []):
-                # Cautions
-                del i['id']
-                if i.img:
-                    i.img.decompose()
-            elif "fignone" in i.get('class', []):
-                # Figures
-                # When we found figure generate local label (anchor)
-                if i.get('id'):
-                    logging.debug('place figure label')
-                    i.insert_before(self.make_label(soup, i.get("id")))
-                figure = soup.new_tag('figure')
-                img = i.find('img')
-                cap = i.find('span', class_='figcap')
-                if cap is not None:
-                    cap.name = 'figcaption'
-                    figure.append(cap)
-                if img:
-                    img['src'] = '/_static/images/' + img['src']
-                    figure.append(img)
-                i.replace_with(figure)
-            elif "section" in i.get('class', []):
-                # Sections
-                # When we found section generate local label (anchor)
-                if i.get('id'):
-                    sec_id = i.get("id").lower()
-                    if self.is_element_referred(sec_id, file_name):
-                        logging.debug('Add section label')
-                        i.insert_before(self.make_label(soup, sec_id))
-                # and still convert to paragraph
-                i.name = 'p'
-            else:
-                i.name = 'p'
-
-        # Drop strong in table headers "/"
-        for th in soup.body.find_all('th'):
-            if th.p.strong:
-                th.p.strong.unwrap()
-
-        if self.args.improve_table_headers:
-            # Add spaces around "/"
-            for th in soup.body.find_all('th'):
-                if hasattr(th, 'p') and th.p.string:
-                    th.p.string = re.sub(
-                        r'\b/\b',
-                        ' / ',
-                        th.p.string)
-
-        # local anchors
-        for lnk in soup.body.find_all("a"):
-            if (
-                lnk.string is None
-                and hasattr(lnk, "name")
-                and not re.match(r"^li\d+$", lnk.attrs["name"])
-                # anywhere section
-                and not re.match(r".*section\d+$", lnk.attrs["name"])
-                # starts with table
-                and not re.match(r"^table\d+$", lnk.attrs["name"])
-            ):
-                # Verify this is really called from somewhere:
-                local_ref = lnk["name"].lower()
-                if self.is_element_referred(local_ref, file_name):
-                    # We now know something in the document wants this anchor -
-                    # replace it with label
-                    lnk.name = "p"
-                    lnk.string = f"..\\_{local_ref}:"
-                    del lnk["name"]
-                else:
-                    logging.debug("Dropping unreferred link")
-
-        for li in soup.body.find_all("li"):
-            del li['id']
-
-        for pre in soup.body.find_all("pre"):
-            text = pre.get_text()
-            # if text.startswith("{"):
-            #    pre["class"] = "data"
-            if re.search(
-                r'\[[a-z]*@\w+.*\][\s#>]?',
-                text
-            ):
-                # Something like "[root@ecs-test-0001 ~]#"
-                pre["class"] = "console"
-            elif re.match(
-                r'^(GET|PUT|POST|DELETE)',
-                text
-            ):
-                # Something like "DELETE https://some_url"
-                pre["class"] = "text"
-
-        # And now specialities
-        rawize_strings = [
-            # "\*\*\*\*\*\*",
-            # r"([\\\/\:\*\?\"\~|<>]{4,})"
-        ]
-        for to_rawize in rawize_strings:
-            for p in soup.body.find_all(string=re.compile(to_rawize)):
-                if p.string:
-                    curr = p.string
-                    part = re.search(to_rawize, curr)
-                    if len(part.groups()) > 0:
-                        new = curr.replace(
-                            part.group(1),
-                            f"<code>{part.group(1)}</code>"
-                        )
-                        p.replace_with(bs4.BeautifulSoup(new, 'html.parser'))
-                        print(part.group(1))
-                        print(f"New content is {p.string}")
-                logging.error(f"String with star: {p}")
-
-        return soup.body
-
-    def main(self):
-        logging.basicConfig(level=logging.DEBUG)
-        parser = argparse.ArgumentParser(description='Process links.')
-        parser.add_argument(
-            'path', type=str, help='path to the files')
-        parser.add_argument(
-            '--improve-table-headers', action='store_true',
-            help='Improve table headers by enforcing spaces around `/`')
-        parser.add_argument(
-            '--pygments-lexer',
-            help='Set particular code-block lexer language')
-        self.args = parser.parse_args()
-        retval = os.getcwd()
-        os.chdir(self.args.path)
-        meta_data = json.loads(open("CLASS.TXT.json").read())
-        metadata_by_uri = dict()
-        metadata_by_code = dict()
-
-        for f in meta_data:
-            f['new_name'] = self.get_new_name(f['title'])
-            metadata_by_uri[f['uri']] = f
-            metadata_by_code[f.get('code')] = f
-
-        tree = self.build_doc_tree(metadata_by_code)
-
-        pathlib.Path("temp/").mkdir(parents=True, exist_ok=True)
-
-        # Scan all docs for anchors
-        for f in pathlib.Path().glob("*.html"):
-            if f.name not in metadata_by_uri:
-                continue
-            # Registering section links
-            with open(f, 'r') as reader:
-                logging.debug(f"Scanning {f.name}")
-                content = reader.read()
-                soup = bs4.BeautifulSoup(content, "lxml")
-                for lnk in soup.body.find_all('a'):
-                    if "name" in lnk.attrs and lnk.string is None:
-                        anchor = lnk.attrs["name"]
-                        title = re.sub('[ _:]', '-', anchor)
-                        res = dict(
-                            fname=f.name,
-                            title=title,
-                            replace=title.lower()
-                        )
-                        self.doc_anchors[anchor] = res
-                    if "href" in lnk.attrs and lnk["href"]:
-                        self.doc_links[lnk["href"].lower()] = f.name
-
-        for f in pathlib.Path().glob("*.html"):
-            if f.name not in metadata_by_uri:
-                continue
-            _target = metadata_by_uri[f.name]
-            target = _target['new_name']
-            target_path = self.get_target_path(
-                _target['p_code'], metadata_by_code)
-            pathlib.Path("temp/").mkdir(parents=True, exist_ok=True)
-            pathlib.Path("tmp_result/" + target_path).mkdir(
-                parents=True, exist_ok=True)
-            pathlib.Path("result/" + target_path).mkdir(
-                parents=True, exist_ok=True)
-
-            # Pre-processing of html content
-            with open(f, 'r') as reader, \
-                 open(f"temp/{target}.tmp", 'w') as writer:
-                # if f.name not in [
-                #         "modelarts_21_0031.html",
-                #         "en-us_topic_0032380449.html"]:
-                #     continue
-                logging.info(f"Pre-Processing {f} as {target}")
-                content = reader.read()
-                soup = bs4.BeautifulSoup(content, "lxml")
-                proc = self.streamline_html(soup, f.name)
-
-                for lnk in proc.find_all("a"):
-                    href = lnk.get('href')
-                    if href and not href.startswith('http'):
-                        # Internal link - replace with :ref:
-                        code = soup.new_tag('code')
-                        code['class'] = "interpreted-text"
-                        code['role'] = "ref"
-                        href_parts = href.split('#')
-                        if len(href_parts) > 1:
-                            # for anchor just use anchor ref
-                            link_target = href_parts[1].lower()
-                        else:
-                            # for other page - use only page name
-                            link_target = href_parts[0].replace(
-                                ".html", "").lower()
-                        if link_target:
-                            # Looks like an anchor on the same page
-                            code.string = f"{lnk.string} <{link_target}>"
-                            logging.debug(f" replace {lnk} with {code}")
-                            lnk.replace_with(code)
-
-                # Drop parent link at the bottom of the page
-                for parent in proc.find_all("p", class_="parentlink"):
-                    parent.decompose()
-
-                logging.info(f'Saving file {writer.name}')
-                writer.write(str(proc))
-
-            # Convert html to rst
-            os.system(
-                f"pandoc 'temp/{target}.tmp' -f html "
-                f"-o 'tmp_result/{target_path}/{target}.rst' "
-                f"--ascii -s --wrap none"
-            )
-            # Post processing of rendered rst
-            with open(f"tmp_result/{target_path}/{target}.rst", 'r') \
-                 as reader, \
-                 open(f"result/{target_path}/{target}.rst", 'w') as writer:
-                logging.info(f"Post processing {target}")
-                writer.write(f":original_name: {f.name}\n\n")
-                # Add root file label
-                writer.write(f".. _{f.name.replace('.html', '')}:\n\n")
-                # post process some usual stuff
-                for line in reader.readlines():
-                    processed_line = re.sub(r'\.\.\\_', '.. _', line)
-                    processed_line = re.sub(r'√', 'Y', processed_line)
-                    processed_line = re.sub(
-                        r'public_sys-resources/', '', processed_line)
-                    processed_line = re.sub(
-                        r'image:: ', 'image:: /_static/images/',
-                        processed_line)
-                    processed_line = re.sub(
-                        r'   :name: .*$', '', processed_line)
-                    processed_line = re.sub(
-                        r'\*\*Parent topic:.*$', '', processed_line)
-                    processed_line = re.sub(
-                        r'.. code:: screen$',
-                        r'.. code-block::', processed_line)
-                    for lexer in ["json", "bash", "text"]:
-                        processed_line = re.sub(
-                            f".. code:: {lexer}$",
-                            f".. code-block:: {lexer}", processed_line)
-                        if re.match(rf".. code:: {lexer}\s", processed_line):
-                            logging.error(
-                                f"'code-block: {lexer}' with something "
-                                "afterwards")
-                            exit(1)
-                    # spaces are important, since code-block may reside inside
-                    # of the cell
-                    processed_line = re.sub(
-                        r'.. code:: screen\s',
-                        r'.. code-block::  ', processed_line)
-                    processed_line = re.sub(
-                        r'.. code:: codeblock$',
-                        r'.. code-block::', processed_line)
-                    writer.write(processed_line)
-
-        # Generate indexes
-        for k, v in tree.items():
-            path = ''
-            title = 'Main Index'
-            page_label = ''
-            if k != 0:
-                curr = metadata_by_code[k]
-                title = curr['title']
-                page_label = curr['uri'].replace(".html", "").lower()
-                path = self.get_target_path(curr['code'], metadata_by_code)
-            with open(f"result/{path}/index.rst", "w") as index:
-                if page_label:
-                    index.write(f".. _{page_label}:\n\n")
-                index.write('=' * (len(title)) + '\n')
-                index.write(title + '\n')
-                index.write('=' * (len(title)) + '\n')
-                index.write('\n')
-                index.write('.. toctree::\n')
-                index.write('   :maxdepth: 1\n\n')
-                for child in v:
-                    new_name = child['new_name']
-                    if child['code'] in tree:
-                        # If this is folder - add /index
-                        new_name = new_name + '/index'
-                    index.write(f"   {new_name}\n")
-
-            p = pathlib.Path(f"result/{path}.rst")
-            if p.exists():
-                logging.warning(
-                    f"{p.resolve()} is removed in favour"
-                    f" of result/{path}/index.rst")
-                p.unlink()
-
-        os.chdir(retval)
-
-
-if __name__ == "__main__":
-    OTCDocConvertor().main()