#!/usr/bin/env python3
import argparse
import json
import logging
import os
import pathlib
import re
import shutil

import bs4
from jinja2 import FileSystemLoader, Environment, select_autoescape


class OTCDocConvertor:
    """Convert OTC HTML documentation pages into RST-friendly output.

    Tracks anchors, link targets and referenced images across the
    document set while individual pages are streamlined.
    """

    def __init__(self):
        # Anchors found per document, and the link targets that refer to them.
        self.doc_anchors = dict()
        self.doc_links = dict()
        # Images referenced by processed pages (filled by streamline_html,
        # which calls self.doc_images.add(...)).  Initialize it here so the
        # instance cannot raise AttributeError if streamline_html runs before
        # any external code assigns the attribute.
        self.doc_images = set()

    @staticmethod
    def get_new_name(current_name):
        """Normalize a document title into a filesystem-friendly name.

        :param current_name: original document title
        :returns: lowercase name with separators collapsed to underscores
                  and quoting/punctuation characters removed
        """
        # Replace " - " first so it becomes a single underscore rather
        # than three (space, dash, space would otherwise each be handled).
        new_name = current_name.replace(" - ", "_")
        new_name = new_name.replace(" ", "_")
        new_name = new_name.replace("/", "_")
        # Characters that are simply dropped from the name.
        for char in ("'", '"', "`", "´", ":", "?", "(", ")"):
            new_name = new_name.replace(char, "")
        return new_name.lower()

    @staticmethod
    def build_doc_tree(metadata):
        """Group metadata entries by their parent code.

        :param metadata: mapping of code -> document metadata dict
        :returns: dict mapping parent code (0 for root items) to the list
                  of child metadata entries
        """
        flat_tree = dict()
        for k, v in metadata.items():
            # Entries without a parent code are attached to pseudo-root 0.
            parent_id = v.get("p_code") or 0
            flat_tree.setdefault(parent_id, list()).append(v)
        return flat_tree

    @classmethod
    def get_target_path(cls, code, metadata):
        """Return the relative output path for the document *code*.

        Recursively walks the parent chain; a code missing from the
        metadata maps to an empty string.
        """
        if code not in metadata:
            return ""
        current = metadata[code]
        parent = current.get("p_code")
        if not parent:
            return current["new_name"]
        return "{0}/{1}".format(
            cls.get_target_path(parent, metadata),
            current["new_name"],
        )

    def make_label(self, soup, name):
        """Build a paragraph tag carrying an RST-style label for *name*."""
        label = soup.new_tag("p")
        label.string = f"..\\_{name.lower()}:"
        return label

    def is_element_referred(self, ref, fname):
        """Whether *ref* is the target of any collected document link.

        Checks the bare reference, "#ref" and "fname#ref" spellings.
        """
        return (
            ref in self.doc_links
            or "#" + ref in self.doc_links
            or fname + "#" + ref in self.doc_links
        )

    def streamline_html(self, soup, file_name):
        """Clean a parsed HTML page in place (anchors, notes, figures...)."""
        # Drop eventual header duplicated anchors
        fname = file_name.replace(".html", "").lower()
        page_anchors = set()
        met_page_anchors = dict()
        for lnk in soup.body.find_all("a"):
            name = None
            if "name" in lnk.attrs and lnk.string is None:
                name = lnk.attrs["name"].lower()
                if name in met_page_anchors:
                    # Such anchor already existed on this page, drop it
                    lnk.decompose()
met_page_anchors[name] = True if name and name.lower() == fname: lnk.decompose() # Process divs for i in soup.body.find_all("div"): if "note" in i.get("class", []): # Notes del i["id"] if i.img: i.img.decompose() notetitle = i.find("span", class_="notetitle") if notetitle: title = soup.new_tag("div") title["class"] = "title" title.string = "Note:" notetitle.replace_with(title) elif "warning" in i.get("class", []): # Warnings del i["id"] if i.img: i.img.decompose() eltitle = i.find("span", class_="warningtitle") if eltitle: title = soup.new_tag("div") title["class"] = "title" title.string = "Warning:" eltitle.replace_with(title) elif "notice" in i.get("class", []): # Notices del i["id"] if i.img: i.img.decompose() i["class"] = "important" elif "caution" in i.get("class", []): # Cautions del i["id"] if i.img: i.img.decompose() elif "fignone" in i.get("class", []): # Figures # When we found figure generate local label (anchor) if i.get("id"): logging.debug("place figure label") i.insert_before(self.make_label(soup, i.get("id"))) figure = soup.new_tag("figure") img = i.find("img") cap = i.find("span", class_="figcap") if cap is not None: cap.name = "figcaption" figure.append(cap) if img: # Store all referred images for copying self.doc_images.add(img["src"]) img["src"] = ( "/_static/images/" + os.path.basename(img["src"]) ) del img["width"] del img["height"] del img["class"] del img["title"] del img["name"] del img["id"] figure.append(img) i.replace_with(figure) elif "section" in i.get("class", []): # Sections if i.get("id"): # When we found section generate local label (anchor) sec_id = i.get("id").lower() if self.is_element_referred(sec_id, file_name): page_anchors.add(sec_id) i.insert_before(self.make_label(soup, sec_id)) i.unwrap() elif i.get("id") and i.get("id").startswith("body"): i.unwrap() else: i.name = "p" # Process remaining images for img in soup.body.find_all("img"): if img["src"] and not img["src"].startswith("/_static/images"): 
self.doc_images.add(img["src"]) img["src"] = "/_static/images/" + os.path.basename(img["src"]) del img["width"] del img["height"] del img["class"] del img["title"] del img["id"] # Drop strong in table headers "/" for th in soup.body.find_all("th"): if th.p.strong: th.p.strong.unwrap() if self.args.improve_table_headers: # Add spaces around "/" for th in soup.body.find_all("th"): if hasattr(th, "p") and th.p.string: th.p.string = re.sub(r"\b/\b", " / ", th.p.string) # Drop strong around links "/" for strong in soup.body.find_all("strong"): if strong.a: strong.unwrap() # table anchors - some tables are referred. Some are having anchor in # front, some not. In order to cope with that we analyze every table # and if it is referred - prepend anchor. Next anchor processing will # skiip it, since such anchor is already placed on the page for table in soup.body.find_all("table"): # Verify this is really called from somewhere: if table.get("id"): local_ref = table["id"].lower() if self.is_element_referred(local_ref, file_name): # We now know something in the document wants this anchor - # replace it with label if local_ref not in page_anchors: lnk = bs4.BeautifulSoup( f"
..\\_{local_ref}:
", "html.parser" ) table.insert_before(lnk) page_anchors.add(local_ref) else: logging.debug( "Not placing replaced anchor %s " " since it already existed", local_ref, ) # local anchors for lnk in soup.body.find_all("a"): if ( lnk.string is None and hasattr(lnk, "name") and not re.match(r"^li\d+$", lnk.attrs["name"]) # anywhere section and not re.match(r".*section\d+$", lnk.attrs["name"]) # starts with table and not re.match(r"^table\d+$", lnk.attrs["name"]) ): # Verify this is really called from somewhere: local_ref = lnk["name"].lower() if self.is_element_referred(local_ref, file_name): # We now know something in the document wants this anchor - # replace it with label if local_ref not in page_anchors: logging.debug("Adding anchor") lnk.name = "p" lnk.string = f"..\\_{local_ref}:" del lnk["name"] page_anchors.add(local_ref) else: logging.debug( "Not placing replaced anchor %s " " since it already existed", local_ref, ) else: logging.debug("Dropping unreferred link %s", lnk) # Undeline element should not be used at all for underline in soup.body.find_all("u"): underline.unwrap() for li in soup.body.find_all("li"): del li["id"] # Sometimes we have code blocks with line numbers. #1 # 2 # 3 | ....
# {em.string}" em.replace_with(bs4.BeautifulSoup(new, "html.parser")) escape_asterisk_re = r"\((\*)[\.,]" for p in soup.body.find_all(string=re.compile(escape_asterisk_re)): if p.string: curr = p.string part = re.search(escape_asterisk_re, curr) # If we have ` all files (*.*)` - no need to escape if ( len(part.groups()) > 0 and p.parent.name not in ["b", "strong"] ): logging.debug( "Found asterisks to escape: %s", part.group(1) ) new = curr.replace( part.group(1), f" |