@@ -239,22 +262,31 @@ class OTCDocConvertor:
text = pre.get_text()
# if text.startswith("{"):
# pre["class"] = "data"
- if re.search(
- r'\[[a-z]*@\w+.*\][\s#>]?',
- text
- ):
+ if re.search(r"\[[a-z]*@\w+.*\][\s#>]?", text):
# Something like "[root@ecs-test-0001 ~]#"
pre["class"] = "console"
- elif re.match(
- r'^(GET|PUT|POST|DELETE)',
- text
- ):
+ elif re.match(r"^(GET|PUT|POST|DELETE)", text):
# Something like "DELETE https://some_url"
pre["class"] = "text"
if "codeblock" in pre.get("class", []):
# all files (*.*)` - no need to escape
+ if len(part.groups()) > 0 and p.parent.name != "b":
+ logging.debug(
+ "Found asterisks to escape: %s", part.group(1)
+ )
+ new = curr.replace(
+ part.group(1), f"{part.group(1)}"
+ )
+ p.replace_with(bs4.BeautifulSoup(new, "html.parser"))
+
# And now specialities
rawize_strings = [
# "\*\*\*\*\*\*",
@@ -265,6 +297,8 @@ class OTCDocConvertor:
r"(\*name)",
# DMS UMN contain: (`~!@#$%^&*()-_=+\|[{}]:'",<.>/?)
r"\(([\W\x60_]{10,})\)",
+ # MRS UMN contain: /:*?"<>|\\;&,'`!{}[]$%+
+ r"\s([^a-zA-Z0-9\s]{10,})",
]
for to_rawize in rawize_strings:
for p in soup.body.find_all(string=re.compile(to_rawize)):
@@ -272,102 +306,104 @@ class OTCDocConvertor:
curr = p.string
part = re.search(to_rawize, curr)
if len(part.groups()) > 0:
- new = curr.replace(
- part.group(1),
- f"{part.group(1)}"
+ logging.debug(
+ "Found element to rawize: %s", part.group(1)
)
- p.replace_with(bs4.BeautifulSoup(new, 'html.parser'))
- print(part.group(1))
- print(f"New content is {p.string}")
+ new = curr.replace(
+ part.group(1), f"{part.group(1)}"
+ )
+ logging.debug("Replacing string with: %s", new)
+ p.replace_with(bs4.BeautifulSoup(new, "html.parser"))
+ logging.debug("Replacing string with: %s", p.string)
else:
- print('ups')
+ logging.error(
+ "Cannot find string for rawization anymore"
+ )
logging.error(f"String with star: {p}")
+ # Drop parent link at the bottom of the page
+ for parent in soup.body.find_all("p", class_="familylinks"):
+ parent.decompose()
+
return soup.body
def main(self):
logging.basicConfig(level=logging.DEBUG)
- parser = argparse.ArgumentParser(description='Process links.')
+ parser = argparse.ArgumentParser(description="Process links.")
+ parser.add_argument("path", type=str, help="path to the files")
parser.add_argument(
- 'path', type=str, help='path to the files')
+ "--improve-table-headers",
+ action="store_true",
+ help="Improve table headers by enforcing spaces around `/`",
+ )
parser.add_argument(
- '--improve-table-headers', action='store_true',
- help='Improve table headers by enforcing spaces around `/`')
+ "--pygments-lexer", help="Set particular code-block lexer language"
+ )
parser.add_argument(
- '--pygments-lexer',
- help='Set particular code-block lexer language')
+ "--dest", help="Directory to write resulting files"
+ )
+ parser.add_argument("--title", required=True, help="Document title")
parser.add_argument(
- '--dest',
- help='Directory to write resulting files')
+ "--service", help="Service to which the document belongs to"
+ )
+ parser.add_argument("--repo-name", help="Service repository")
+ parser.add_argument("--pdf-name", help="PDF File name")
parser.add_argument(
- '--title',
- required=True,
- help='Document title')
- parser.add_argument(
- '--service',
- help='Service to which the document belongs to')
- parser.add_argument(
- '--repo-name',
- help='Service repository')
- parser.add_argument(
- '--pdf-name',
- help='PDF File name')
- parser.add_argument(
- '--templates-location',
- default='templates',
- help='Location of additional templates')
+ "--templates-location",
+ default="templates",
+ help="Location of additional templates",
+ )
self.args = parser.parse_args()
if self.args.dest:
dest = pathlib.Path(self.args.dest)
else:
- dest = pathlib.Path(self.args.path, 'result')
+ dest = pathlib.Path(self.args.path, "result")
dest.mkdir(parents=True, exist_ok=True)
- metadata_file = pathlib.Path(
- self.args.path, "CLASS.TXT.json")
+ metadata_file = pathlib.Path(self.args.path, "CLASS.TXT.json")
meta_data = dict()
if not metadata_file.exists():
logging.warning(
f"CLASS.TXT.json file is missing in {self.args.path}, "
- f"assuming initial import")
+ f"assuming initial import"
+ )
with open(pathlib.Path(dest, "index.rst"), "w") as index:
- index.write('=' * (len(self.args.title)) + '\n')
- index.write(self.args.title + '\n')
- index.write('=' * (len(self.args.title)) + '\n')
- index.write('\n')
+ index.write("=" * (len(self.args.title)) + "\n")
+ index.write(self.args.title + "\n")
+ index.write("=" * (len(self.args.title)) + "\n")
+ index.write("\n")
else:
meta_data = json.loads(open(metadata_file).read())
metadata_by_uri = dict()
metadata_by_code = dict()
self.doc_images = set()
for f in meta_data:
- f['new_name'] = self.get_new_name(f['title'])
- metadata_by_uri[f['uri']] = f
- metadata_by_code[f.get('code')] = f
+ f["new_name"] = self.get_new_name(f["title"])
+ metadata_by_uri[f["uri"]] = f
+ metadata_by_code[f.get("code")] = f
tree = self.build_doc_tree(metadata_by_code)
pathlib.Path(self.args.path, "temp/").mkdir(
- parents=True, exist_ok=True)
+ parents=True, exist_ok=True
+ )
# Scan all docs for anchors
for f in pathlib.Path(self.args.path).glob("*.html"):
if f.name not in metadata_by_uri:
continue
# Registering section links
- with open(f, 'r') as reader:
+ with open(f, "r") as reader:
logging.debug(f"Scanning {f.name}")
content = reader.read()
soup = bs4.BeautifulSoup(content, "lxml")
- for lnk in soup.body.find_all('a'):
+ for lnk in soup.body.find_all("a"):
if "name" in lnk.attrs and lnk.string is None:
anchor = lnk.attrs["name"]
- title = re.sub('[ _:]', '-', anchor)
+ title = re.sub("[ _:]", "-", anchor)
res = dict(
- fname=f.name,
- title=title,
- replace=title.lower()
+ fname=f.name, title=title, replace=title.lower()
)
self.doc_anchors[anchor] = res
if "href" in lnk.attrs and lnk["href"]:
@@ -377,20 +413,22 @@ class OTCDocConvertor:
if f.name not in metadata_by_uri:
continue
_target = metadata_by_uri[f.name]
- target = _target['new_name']
+ target = _target["new_name"]
target_path = self.get_target_path(
- _target['p_code'], metadata_by_code)
+ _target["p_code"], metadata_by_code
+ )
pathlib.Path(self.args.path, "temp").mkdir(
- parents=True, exist_ok=True)
+ parents=True, exist_ok=True
+ )
pathlib.Path(self.args.path, "tmp_result/" + target_path).mkdir(
- parents=True, exist_ok=True)
- pathlib.Path(dest, target_path).mkdir(
- parents=True, exist_ok=True)
+ parents=True, exist_ok=True
+ )
+ pathlib.Path(dest, target_path).mkdir(parents=True, exist_ok=True)
# Pre-processing of html content
- with open(f, 'r') as reader, \
- open(pathlib.Path(self.args.path,
- f"temp/{target}.tmp"), 'w') as writer:
+ with open(f, "r") as reader, open(
+ pathlib.Path(self.args.path, f"temp/{target}.tmp"), "w"
+ ) as writer:
# if f.name not in [
# ]:
# continue
@@ -400,31 +438,28 @@ class OTCDocConvertor:
proc = self.streamline_html(soup, f.name)
for lnk in proc.find_all("a"):
- href = lnk.get('href')
- if href and not href.startswith('http'):
+ href = lnk.get("href")
+ if href and not href.startswith("http"):
# Internal link - replace with :ref:
- code = soup.new_tag('code')
- code['class'] = "interpreted-text"
- code['role'] = "ref"
- href_parts = href.split('#')
+ code = soup.new_tag("code")
+ code["class"] = "interpreted-text"
+ code["role"] = "ref"
+ href_parts = href.split("#")
if len(href_parts) > 1:
# for anchor just use anchor ref
link_target = href_parts[1].lower()
else:
# for other page - use only page name
- link_target = href_parts[0].replace(
- ".html", "").lower()
+ link_target = (
+ href_parts[0].replace(".html", "").lower()
+ )
if link_target:
# Looks like an anchor on the same page
code.string = f"{lnk.string} <{link_target}>"
logging.debug(f" replace {lnk} with {code}")
lnk.replace_with(code)
- # Drop parent link at the bottom of the page
- for parent in proc.find_all("p", class_="parentlink"):
- parent.decompose()
-
- logging.info(f'Saving file {writer.name}')
+ logging.info(f"Saving file {writer.name}")
writer.write(str(proc))
# Convert html to rst
@@ -434,58 +469,84 @@ class OTCDocConvertor:
f"--ascii -s --wrap none"
)
# Post processing of rendered rst
- with open(f"{self.args.path}/tmp_result/"
- f"{target_path}/{target}.rst", 'r') \
- as reader, \
- open(pathlib.Path(dest, target_path,
- f"{target}.rst"), 'w') as writer:
+ with open(
+ f"{self.args.path}/tmp_result/" f"{target_path}/{target}.rst",
+ "r",
+ ) as reader, open(
+ pathlib.Path(dest, target_path, f"{target}.rst"), "w"
+ ) as writer:
logging.info(f"Post processing {target}...")
writer.write(f":original_name: {f.name}\n\n")
# Add root file label
writer.write(f".. _{f.name.replace('.html', '')}:\n\n")
# post process some usual stuff
for line in reader.readlines():
- processed_line = re.sub(r'\.\.\\_', '.. _', line)
- processed_line = re.sub(r'√', 'Y', processed_line)
+ processed_line = re.sub(r"\.\.\\\\_", ".. _", line)
+ processed_line = re.sub(r"\.\.\\_", ".. _", processed_line)
+ # We could get unwanted anchors from pandoc - get rid of
+ # them
+ anchor = re.search(r"\.\. \_(.*):", processed_line)
+ if anchor and len(anchor.groups()) > 0:
+ if not self.is_element_referred(
+ anchor.group(1), f.name
+ ):
+ # This is most likely some duplicated anchor. It is
+ # not referred from any other place so drop it
+ logging.info("Dropping not referred anchor")
+ continue
+
+ processed_line = re.sub(r"√", "Y", processed_line)
processed_line = re.sub(
- r'public_sys-resources/', '', processed_line)
+ r"public_sys-resources/", "", processed_line
+ )
processed_line = re.sub(
- r' :name: .*$', '', processed_line)
+ r" :name: .*$", "", processed_line
+ )
processed_line = re.sub(
- r'\*\*Parent topic:.*$', '', processed_line)
+ r"\*\*Parent topic:.*$", "", processed_line
+ )
processed_line = re.sub(
- r'.. code:: screen$',
- r'.. code-block::', processed_line)
+ r".. code:: screen$",
+ r".. code-block::",
+ processed_line,
+ )
for lexer in ["json", "bash", "text", "console"]:
processed_line = re.sub(
f".. code:: {lexer}$",
- f".. code-block:: {lexer}", processed_line)
+ f".. code-block:: {lexer}",
+ processed_line,
+ )
if re.match(rf".. code:: {lexer}\s", processed_line):
logging.error(
f"'code-block: {lexer}' with something "
- "afterwards")
+ "afterwards"
+ )
exit(1)
# spaces are important, since code-block may reside inside
# of the cell
processed_line = re.sub(
- r'.. code:: screen\s',
- r'.. code-block:: ', processed_line)
+ r".. code:: screen\s",
+ r".. code-block:: ",
+ processed_line,
+ )
processed_line = re.sub(
- r'.. code:: codeblock$',
- r'.. code-block::', processed_line)
- processed_line = re.sub(r'[ \t]*$', '', processed_line)
+ r".. code:: codeblock$",
+ r".. code-block::",
+ processed_line,
+ )
+ processed_line = re.sub(r"[ \t]*$", "", processed_line)
writer.write(processed_line)
# Generate indexes
for k, v in tree.items():
- path = ''
+ path = ""
title = self.args.title
- page_label = ''
+ page_label = ""
if k != 0:
curr = metadata_by_code[k]
- title = curr['title']
- page_label = curr['uri'].replace(".html", "").lower()
- path = self.get_target_path(curr['code'], metadata_by_code)
+ title = curr["title"]
+ page_label = curr["uri"].replace(".html", "").lower()
+ path = self.get_target_path(curr["code"], metadata_by_code)
p = pathlib.Path(dest, f"{path}.rst")
if p.exists():
@@ -495,50 +556,51 @@ class OTCDocConvertor:
# Update existing index file
p.rename(pathlib.Path(dest, f"{path}/index.rst"))
with open(pathlib.Path(dest, path, "index.rst"), "a") as index:
- index.write('\n')
- index.write('.. toctree::\n')
- index.write(' :maxdepth: 1\n')
- index.write(' :hidden: \n\n')
+ index.write("\n")
+ index.write(".. toctree::\n")
+ index.write(" :maxdepth: 1\n")
+ index.write(" :hidden: \n\n")
for child in v:
- new_name = child['new_name']
- if child['code'] in tree:
+ new_name = child["new_name"]
+ if child["code"] in tree:
# If this is folder - add /index
- new_name = new_name + '/index'
+ new_name = new_name + "/index"
index.write(f" {new_name}\n")
else:
with open(pathlib.Path(dest, path, "index.rst"), "w") as index:
# New index file
if page_label:
index.write(f".. _{page_label}:\n\n")
- index.write('=' * (len(title)) + '\n')
- index.write(title + '\n')
- index.write('=' * (len(title)) + '\n')
- index.write('\n')
- index.write('.. toctree::\n')
- index.write(' :maxdepth: 1\n\n')
+ index.write("=" * (len(title)) + "\n")
+ index.write(title + "\n")
+ index.write("=" * (len(title)) + "\n")
+ index.write("\n")
+ index.write(".. toctree::\n")
+ index.write(" :maxdepth: 1\n\n")
for child in v:
- new_name = child['new_name']
- if child['code'] in tree:
+ new_name = child["new_name"]
+ if child["code"] in tree:
# If this is folder - add /index
- new_name = new_name + '/index'
+ new_name = new_name + "/index"
index.write(f" {new_name}\n")
# Copy used images
if len(self.doc_images) > 0:
logging.debug("Processing images...")
- img_dest = pathlib.Path(dest, '_static', 'images')
+ img_dest = pathlib.Path(dest, "_static", "images")
img_dest.mkdir(parents=True, exist_ok=True)
for img in self.doc_images:
shutil.copyfile(
pathlib.Path(self.args.path, img).resolve(strict=False),
- pathlib.Path(
- img_dest, os.path.basename(img)).resolve(strict=False)
+ pathlib.Path(img_dest, os.path.basename(img)).resolve(
+ strict=False
+ ),
)
context = dict(
title=self.args.title,
project=self.args.service,
repo_name=self.args.repo_name,
- pdf_name=self.args.pdf_name
+ pdf_name=self.args.pdf_name,
)
loader = FileSystemLoader([self.args.templates_location])
env = Environment(loader=loader, autoescape=select_autoescape())
@@ -547,9 +609,7 @@ class OTCDocConvertor:
outfile_rendered = outfile_tmpl.render(**context)
target_file = pathlib.Path(self.args.dest, f)
target_file.parent.mkdir(parents=True, exist_ok=True)
- with open(
- target_file, 'w', encoding='utf-8', newline=''
- ) as out:
+ with open(target_file, "w", encoding="utf-8", newline="") as out:
logging.debug(f"Generating {f} from template...")
out.write(outfile_rendered)
diff --git a/otc_doc_convertor/process.py b/otc_doc_convertor/process.py
deleted file mode 100644
index d2e5e76d7..000000000
--- a/otc_doc_convertor/process.py
+++ /dev/null
@@ -1,409 +0,0 @@
-#!/usr/bin/env python3
-
-import argparse
-import bs4
-import json
-import logging
-import os
-import pathlib
-import re
-
-
-class OTCDocConvertor:
-
- def __init__(self):
- self.doc_anchors = dict()
- self.doc_links = dict()
-
- @staticmethod
- def get_new_name(current_name):
- new_name = current_name.replace(' - ', '_')
- new_name = new_name.replace(' ', '_')
- new_name = new_name.replace('/', '_')
- new_name = new_name.replace('\'', '')
- new_name = new_name.replace('"', '')
- new_name = new_name.replace('`', '')
- new_name = new_name.replace('´', '')
- new_name = new_name.replace(':', '')
- new_name = new_name.replace('?', '')
- new_name = new_name.replace('(', '')
- new_name = new_name.replace(')', '')
- new_name = new_name.lower()
- return new_name
-
- @staticmethod
- def build_doc_tree(metadata):
- flat_tree = dict()
- for k, v in metadata.items():
- parent_id = v.get('p_code')
- if not parent_id:
- parent_id = 0
-
- if parent_id not in flat_tree:
- flat_tree[parent_id] = list()
- flat_tree[parent_id].append(v)
- return flat_tree
-
- @classmethod
- def get_target_path(cls, code, metadata, path=''):
- if code in metadata:
- current = metadata[code]
- if not current.get('p_code'):
- return current['new_name']
- else:
- return (
- "{0}/{1}".format(
- cls.get_target_path(current['p_code'], metadata),
- current['new_name'])
- )
- else:
- return ''
-
- def make_label(self, soup, name):
- label = soup.new_tag("p")
- label.string = f"..\\_{name.lower()}:"
- return label
-
- def is_element_referred(self, ref, fname):
- return (
- ref in self.doc_links
- or '#' + ref in self.doc_links
- or fname + '#' + ref in self.doc_links
- )
-
- def streamline_html(self, soup, file_name):
- # Drop eventual header duplicated anchors
- fname = file_name.replace(".html", "").lower()
- met_page_anchors = dict()
- for lnk in soup.body.find_all("a"):
- name = None
- if "name" in lnk.attrs and lnk.string is None:
- name = lnk.attrs["name"].lower()
- if name in met_page_anchors:
- # Such anchor already existed on this page, drop it
- lnk.decompose()
- met_page_anchors[name] = True
-
- if name and name.lower() == fname:
- lnk.decompose()
-
- # Process divs
- for i in soup.body.find_all('div'):
- if "note" in i.get('class', []):
- # Notes
- del i['id']
- if i.img:
- i.img.decompose()
- notetitle = i.find('span', class_='notetitle')
- if notetitle:
- title = soup.new_tag('div')
- title['class'] = 'title'
- title.string = 'Note:'
- notetitle.replace_with(title)
- elif "notice" in i.get('class', []):
- # Notices
- del i['id']
- if i.img:
- i.img.decompose()
- i['class'] = 'important'
- elif "caution" in i.get('class', []):
- # Cautions
- del i['id']
- if i.img:
- i.img.decompose()
- elif "fignone" in i.get('class', []):
- # Figures
- # When we found figure generate local label (anchor)
- if i.get('id'):
- logging.debug('place figure label')
- i.insert_before(self.make_label(soup, i.get("id")))
- figure = soup.new_tag('figure')
- img = i.find('img')
- cap = i.find('span', class_='figcap')
- if cap is not None:
- cap.name = 'figcaption'
- figure.append(cap)
- if img:
- img['src'] = '/_static/images/' + img['src']
- figure.append(img)
- i.replace_with(figure)
- elif "section" in i.get('class', []):
- # Sections
- # When we found section generate local label (anchor)
- if i.get('id'):
- sec_id = i.get("id").lower()
- if self.is_element_referred(sec_id, file_name):
- logging.debug('Add section label')
- i.insert_before(self.make_label(soup, sec_id))
- # and still convert to paragraph
- i.name = 'p'
- else:
- i.name = 'p'
-
- # Drop strong in table headers "/"
- for th in soup.body.find_all('th'):
- if th.p.strong:
- th.p.strong.unwrap()
-
- if self.args.improve_table_headers:
- # Add spaces around "/"
- for th in soup.body.find_all('th'):
- if hasattr(th, 'p') and th.p.string:
- th.p.string = re.sub(
- r'\b/\b',
- ' / ',
- th.p.string)
-
- # local anchors
- for lnk in soup.body.find_all("a"):
- if (
- lnk.string is None
- and hasattr(lnk, "name")
- and not re.match(r"^li\d+$", lnk.attrs["name"])
- # anywhere section
- and not re.match(r".*section\d+$", lnk.attrs["name"])
- # starts with table
- and not re.match(r"^table\d+$", lnk.attrs["name"])
- ):
- # Verify this is really called from somewhere:
- local_ref = lnk["name"].lower()
- if self.is_element_referred(local_ref, file_name):
- # We now know something in the document wants this anchor -
- # replace it with label
- lnk.name = "p"
- lnk.string = f"..\\_{local_ref}:"
- del lnk["name"]
- else:
- logging.debug("Dropping unreferred link")
-
- for li in soup.body.find_all("li"):
- del li['id']
-
- for pre in soup.body.find_all("pre"):
- text = pre.get_text()
- # if text.startswith("{"):
- # pre["class"] = "data"
- if re.search(
- r'\[[a-z]*@\w+.*\][\s#>]?',
- text
- ):
- # Something like "[root@ecs-test-0001 ~]#"
- pre["class"] = "console"
- elif re.match(
- r'^(GET|PUT|POST|DELETE)',
- text
- ):
- # Something like "DELETE https://some_url"
- pre["class"] = "text"
-
- # And now specialities
- rawize_strings = [
- # "\*\*\*\*\*\*",
- # r"([\\\/\:\*\?\"\~|<>]{4,})"
- ]
- for to_rawize in rawize_strings:
- for p in soup.body.find_all(string=re.compile(to_rawize)):
- if p.string:
- curr = p.string
- part = re.search(to_rawize, curr)
- if len(part.groups()) > 0:
- new = curr.replace(
- part.group(1),
- f"{part.group(1)}"
- )
- p.replace_with(bs4.BeautifulSoup(new, 'html.parser'))
- print(part.group(1))
- print(f"New content is {p.string}")
- logging.error(f"String with star: {p}")
-
- return soup.body
-
- def main(self):
- logging.basicConfig(level=logging.DEBUG)
- parser = argparse.ArgumentParser(description='Process links.')
- parser.add_argument(
- 'path', type=str, help='path to the files')
- parser.add_argument(
- '--improve-table-headers', action='store_true',
- help='Improve table headers by enforcing spaces around `/`')
- parser.add_argument(
- '--pygments-lexer',
- help='Set particular code-block lexer language')
- self.args = parser.parse_args()
- retval = os.getcwd()
- os.chdir(self.args.path)
- meta_data = json.loads(open("CLASS.TXT.json").read())
- metadata_by_uri = dict()
- metadata_by_code = dict()
-
- for f in meta_data:
- f['new_name'] = self.get_new_name(f['title'])
- metadata_by_uri[f['uri']] = f
- metadata_by_code[f.get('code')] = f
-
- tree = self.build_doc_tree(metadata_by_code)
-
- pathlib.Path("temp/").mkdir(parents=True, exist_ok=True)
-
- # Scan all docs for anchors
- for f in pathlib.Path().glob("*.html"):
- if f.name not in metadata_by_uri:
- continue
- # Registering section links
- with open(f, 'r') as reader:
- logging.debug(f"Scanning {f.name}")
- content = reader.read()
- soup = bs4.BeautifulSoup(content, "lxml")
- for lnk in soup.body.find_all('a'):
- if "name" in lnk.attrs and lnk.string is None:
- anchor = lnk.attrs["name"]
- title = re.sub('[ _:]', '-', anchor)
- res = dict(
- fname=f.name,
- title=title,
- replace=title.lower()
- )
- self.doc_anchors[anchor] = res
- if "href" in lnk.attrs and lnk["href"]:
- self.doc_links[lnk["href"].lower()] = f.name
-
- for f in pathlib.Path().glob("*.html"):
- if f.name not in metadata_by_uri:
- continue
- _target = metadata_by_uri[f.name]
- target = _target['new_name']
- target_path = self.get_target_path(
- _target['p_code'], metadata_by_code)
- pathlib.Path("temp/").mkdir(parents=True, exist_ok=True)
- pathlib.Path("tmp_result/" + target_path).mkdir(
- parents=True, exist_ok=True)
- pathlib.Path("result/" + target_path).mkdir(
- parents=True, exist_ok=True)
-
- # Pre-processing of html content
- with open(f, 'r') as reader, \
- open(f"temp/{target}.tmp", 'w') as writer:
- # if f.name not in [
- # "modelarts_21_0031.html",
- # "en-us_topic_0032380449.html"]:
- # continue
- logging.info(f"Pre-Processing {f} as {target}")
- content = reader.read()
- soup = bs4.BeautifulSoup(content, "lxml")
- proc = self.streamline_html(soup, f.name)
-
- for lnk in proc.find_all("a"):
- href = lnk.get('href')
- if href and not href.startswith('http'):
- # Internal link - replace with :ref:
- code = soup.new_tag('code')
- code['class'] = "interpreted-text"
- code['role'] = "ref"
- href_parts = href.split('#')
- if len(href_parts) > 1:
- # for anchor just use anchor ref
- link_target = href_parts[1].lower()
- else:
- # for other page - use only page name
- link_target = href_parts[0].replace(
- ".html", "").lower()
- if link_target:
- # Looks like an anchor on the same page
- code.string = f"{lnk.string} <{link_target}>"
- logging.debug(f" replace {lnk} with {code}")
- lnk.replace_with(code)
-
- # Drop parent link at the bottom of the page
- for parent in proc.find_all("p", class_="parentlink"):
- parent.decompose()
-
- logging.info(f'Saving file {writer.name}')
- writer.write(str(proc))
-
- # Convert html to rst
- os.system(
- f"pandoc 'temp/{target}.tmp' -f html "
- f"-o 'tmp_result/{target_path}/{target}.rst' "
- f"--ascii -s --wrap none"
- )
- # Post processing of rendered rst
- with open(f"tmp_result/{target_path}/{target}.rst", 'r') \
- as reader, \
- open(f"result/{target_path}/{target}.rst", 'w') as writer:
- logging.info(f"Post processing {target}")
- writer.write(f":original_name: {f.name}\n\n")
- # Add root file label
- writer.write(f".. _{f.name.replace('.html', '')}:\n\n")
- # post process some usual stuff
- for line in reader.readlines():
- processed_line = re.sub(r'\.\.\\_', '.. _', line)
- processed_line = re.sub(r'√', 'Y', processed_line)
- processed_line = re.sub(
- r'public_sys-resources/', '', processed_line)
- processed_line = re.sub(
- r'image:: ', 'image:: /_static/images/',
- processed_line)
- processed_line = re.sub(
- r' :name: .*$', '', processed_line)
- processed_line = re.sub(
- r'\*\*Parent topic:.*$', '', processed_line)
- processed_line = re.sub(
- r'.. code:: screen$',
- r'.. code-block::', processed_line)
- for lexer in ["json", "bash", "text"]:
- processed_line = re.sub(
- f".. code:: {lexer}$",
- f".. code-block:: {lexer}", processed_line)
- if re.match(rf".. code:: {lexer}\s", processed_line):
- logging.error(
- f"'code-block: {lexer}' with something "
- "afterwards")
- exit(1)
- # spaces are important, since code-block may reside inside
- # of the cell
- processed_line = re.sub(
- r'.. code:: screen\s',
- r'.. code-block:: ', processed_line)
- processed_line = re.sub(
- r'.. code:: codeblock$',
- r'.. code-block::', processed_line)
- writer.write(processed_line)
-
- # Generate indexes
- for k, v in tree.items():
- path = ''
- title = 'Main Index'
- page_label = ''
- if k != 0:
- curr = metadata_by_code[k]
- title = curr['title']
- page_label = curr['uri'].replace(".html", "").lower()
- path = self.get_target_path(curr['code'], metadata_by_code)
- with open(f"result/{path}/index.rst", "w") as index:
- if page_label:
- index.write(f".. _{page_label}:\n\n")
- index.write('=' * (len(title)) + '\n')
- index.write(title + '\n')
- index.write('=' * (len(title)) + '\n')
- index.write('\n')
- index.write('.. toctree::\n')
- index.write(' :maxdepth: 1\n\n')
- for child in v:
- new_name = child['new_name']
- if child['code'] in tree:
- # If this is folder - add /index
- new_name = new_name + '/index'
- index.write(f" {new_name}\n")
-
- p = pathlib.Path(f"result/{path}.rst")
- if p.exists():
- logging.warning(
- f"{p.resolve()} is removed in favour"
- f" of result/{path}/index.rst")
- p.unlink()
-
- os.chdir(retval)
-
-
-if __name__ == "__main__":
- OTCDocConvertor().main()