#!/usr/bin/env python3
import argparse
import json
import logging
import os
import pathlib
import re
import shutil

import bs4
from jinja2 import FileSystemLoader, Environment, select_autoescape


class OTCDocConvertor:
    """Convert OTC HTML documentation pages into RST-friendly output.

    Tracks anchors, link targets and referenced images across the
    document set while individual pages are streamlined.
    """

    def __init__(self):
        # Anchors found per document, and the link targets that refer to them.
        self.doc_anchors = dict()
        self.doc_links = dict()
        # Images referenced by processed pages (filled by streamline_html,
        # which calls self.doc_images.add(...)).  Initialize it here so the
        # instance cannot raise AttributeError if streamline_html runs before
        # any external code assigns the attribute.
        self.doc_images = set()

    @staticmethod
    def get_new_name(current_name):
        """Normalize a document title into a filesystem-friendly name.

        :param current_name: original document title
        :returns: lowercase name with separators collapsed to underscores
                  and quoting/punctuation characters removed
        """
        # Replace " - " first so it becomes a single underscore rather
        # than three (space, dash, space would otherwise each be handled).
        new_name = current_name.replace(" - ", "_")
        new_name = new_name.replace(" ", "_")
        new_name = new_name.replace("/", "_")
        # Characters that are simply dropped from the name.
        for char in ("'", '"', "`", "´", ":", "?", "(", ")"):
            new_name = new_name.replace(char, "")
        return new_name.lower()

    @staticmethod
    def build_doc_tree(metadata):
        """Group metadata entries by their parent code.

        :param metadata: mapping of code -> document metadata dict
        :returns: dict mapping parent code (0 for root items) to the list
                  of child metadata entries
        """
        flat_tree = dict()
        for k, v in metadata.items():
            # Entries without a parent code are attached to pseudo-root 0.
            parent_id = v.get("p_code") or 0
            flat_tree.setdefault(parent_id, list()).append(v)
        return flat_tree

    @classmethod
    def get_target_path(cls, code, metadata):
        """Return the relative output path for the document *code*.

        Recursively walks the parent chain; a code missing from the
        metadata maps to an empty string.
        """
        if code not in metadata:
            return ""
        current = metadata[code]
        parent = current.get("p_code")
        if not parent:
            return current["new_name"]
        return "{0}/{1}".format(
            cls.get_target_path(parent, metadata),
            current["new_name"],
        )

    def make_label(self, soup, name):
        """Build a paragraph tag carrying an RST-style label for *name*."""
        label = soup.new_tag("p")
        label.string = f"..\\_{name.lower()}:"
        return label

    def is_element_referred(self, ref, fname):
        """Whether *ref* is the target of any collected document link.

        Checks the bare reference, "#ref" and "fname#ref" spellings.
        """
        return (
            ref in self.doc_links
            or "#" + ref in self.doc_links
            or fname + "#" + ref in self.doc_links
        )

    def streamline_html(self, soup, file_name):
        """Clean a parsed HTML page in place (anchors, notes, figures...)."""
        # Drop eventual header duplicated anchors
        fname = file_name.replace(".html", "").lower()
        page_anchors = set()
        met_page_anchors = dict()
        for lnk in soup.body.find_all("a"):
            name = None
            if "name" in lnk.attrs and lnk.string is None:
                name = lnk.attrs["name"].lower()
                if name in met_page_anchors:
                    # Such anchor already existed on this page, drop it
                    lnk.decompose()
met_page_anchors[name] = True if name and name.lower() == fname: lnk.decompose() # Process divs for i in soup.body.find_all("div"): if "note" in i.get("class", []): # Notes del i["id"] if i.img: i.img.decompose() notetitle = i.find("span", class_="notetitle") if notetitle: title = soup.new_tag("div") title["class"] = "title" title.string = "Note:" notetitle.replace_with(title) elif "warning" in i.get("class", []): # Warnings del i["id"] if i.img: i.img.decompose() eltitle = i.find("span", class_="warningtitle") if eltitle: title = soup.new_tag("div") title["class"] = "title" title.string = "Warning:" eltitle.replace_with(title) elif "notice" in i.get("class", []): # Notices del i["id"] if i.img: i.img.decompose() i["class"] = "important" elif "caution" in i.get("class", []): # Cautions del i["id"] if i.img: i.img.decompose() elif "fignone" in i.get("class", []): # Figures # When we found figure generate local label (anchor) if i.get("id"): logging.debug("place figure label") i.insert_before(self.make_label(soup, i.get("id"))) figure = soup.new_tag("figure") img = i.find("img") cap = i.find("span", class_="figcap") if cap is not None: cap.name = "figcaption" figure.append(cap) if img: # Store all referred images for copying self.doc_images.add(img["src"]) img["src"] = ( "/_static/images/" + os.path.basename(img["src"]) ) del img["width"] del img["height"] del img["class"] del img["title"] del img["name"] del img["id"] figure.append(img) i.replace_with(figure) elif "section" in i.get("class", []): # Sections if i.get("id"): # When we found section generate local label (anchor) sec_id = i.get("id").lower() if self.is_element_referred(sec_id, file_name): page_anchors.add(sec_id) i.insert_before(self.make_label(soup, sec_id)) i.unwrap() elif i.get("id") and i.get("id").startswith("body"): i.unwrap() else: i.name = "p" # Process remaining images for img in soup.body.find_all("img"): if img["src"] and not img["src"].startswith("/_static/images"): 
self.doc_images.add(img["src"]) img["src"] = "/_static/images/" + os.path.basename(img["src"]) del img["width"] del img["height"] del img["class"] del img["title"] del img["id"] # Drop strong in table headers "/" for th in soup.body.find_all("th"): if th.p.strong: th.p.strong.unwrap() if self.args.improve_table_headers: # Add spaces around "/" for th in soup.body.find_all("th"): if hasattr(th, "p") and th.p.string: th.p.string = re.sub(r"\b/\b", " / ", th.p.string) # Drop strong around links "/" for strong in soup.body.find_all("strong"): if strong.a: strong.unwrap() # table anchors - some tables are referred. Some are having anchor in # front, some not. In order to cope with that we analyze every table # and if it is referred - prepend anchor. Next anchor processing will # skiip it, since such anchor is already placed on the page for table in soup.body.find_all("table"): # Verify this is really called from somewhere: if table.get("id"): local_ref = table["id"].lower() if self.is_element_referred(local_ref, file_name): # We now know something in the document wants this anchor - # replace it with label if local_ref not in page_anchors: lnk = bs4.BeautifulSoup( f"
..\\_{local_ref}:
", "html.parser" ) table.insert_before(lnk) page_anchors.add(local_ref) else: logging.debug( "Not placing replaced anchor %s " " since it already existed", local_ref, ) # local anchors for lnk in soup.body.find_all("a"): if ( lnk.string is None and hasattr(lnk, "name") and not re.match(r"^li\d+$", lnk.attrs["name"]) # anywhere section and not re.match(r".*section\d+$", lnk.attrs["name"]) # starts with table and not re.match(r"^table\d+$", lnk.attrs["name"]) ): # Verify this is really called from somewhere: local_ref = lnk["name"].lower() if self.is_element_referred(local_ref, file_name): # We now know something in the document wants this anchor - # replace it with label if local_ref not in page_anchors: logging.debug("Adding anchor") lnk.name = "p" lnk.string = f"..\\_{local_ref}:" del lnk["name"] page_anchors.add(local_ref) else: logging.debug( "Not placing replaced anchor %s " " since it already existed", local_ref, ) else: logging.debug("Dropping unreferred link %s", lnk) # Undeline element should not be used at all for underline in soup.body.find_all("u"): underline.unwrap() for li in soup.body.find_all("li"): del li["id"] # Sometimes we have code blocks with line numbers. #1 # 2 # 3 | ....
# {em.string}" em.replace_with(bs4.BeautifulSoup(new, "html.parser")) escape_asterisk_re = r"\((\*)[\.,]" for p in soup.body.find_all(string=re.compile(escape_asterisk_re)): if p.string: curr = p.string part = re.search(escape_asterisk_re, curr) # If we have ` all files (*.*)` - no need to escape if ( len(part.groups()) > 0 and p.parent.name not in ["b", "strong"] ): logging.debug( "Found asterisks to escape: %s", part.group(1) ) new = curr.replace( part.group(1), f" |