"""Controller thread of the ``hc_spider`` package.

Seeds the job queue with the configured starting URL and raises the shared
shutdown event once no unvisited URLs remain.
"""
# Provenance: hc_spider/controller.py, added by commit 2d8e03d ("Added controller module").

import os
import threading
import time

from hc_spider.model import SharedObjects


class Controller(threading.Thread):
    """Daemon thread that starts the crawl and decides when it is finished.

    The thread enqueues the ``starting_point`` URL taken from the shared
    configuration, then periodically inspects ``not_visited_nodes``. When that
    mapping has been observed empty on several consecutive polls, the shared
    ``shutdown_event`` is set.

    All keyword arguments given to the constructor are forwarded unchanged to
    ``SharedObjects``.
    """

    # Seconds to wait after seeding the queue, so that workers have a chance
    # to place their first items before the emptiness checks begin.
    _STARTUP_GRACE_SECONDS = 2
    # Seconds between two inspections of ``not_visited_nodes``.
    _POLL_INTERVAL_SECONDS = 2
    # Number of *consecutive* empty polls required before shutting down.
    _EMPTY_POLLS_BEFORE_EXIT = 2

    _shared_objects: SharedObjects

    def __init__(self, **kwargs) -> None:
        """Create the controller.

        Args:
            **kwargs: Forwarded verbatim to ``SharedObjects``.
        """
        # Initialise the Thread part first: ``__del__`` reads ``self.name``,
        # which asserts that ``Thread.__init__`` has run. If ``SharedObjects``
        # raised before that, finalisation would fail with a second error.
        super().__init__(name="Controller", daemon=True)
        self._shared_objects = SharedObjects(**kwargs)

    def start(self) -> None:
        """Announce the start-up on stdout, then start the thread."""
        print(f"[{self.name}] is starting")
        super().start()

    def run(self) -> None:
        """Seed the crawl and block until shutdown is requested or work runs out."""
        print(f"{self.name} started with pid [{os.getpid()}]")

        if self._enqueue_starting_point():
            self._wait_until_exhausted()

        print(f"[{self.name}] is shutting down", flush=True)

    def _enqueue_starting_point(self) -> bool:
        """Register and enqueue the configured starting URL.

        Returns:
            True when a starting URL was enqueued. False when the
            configuration has no usable ``starting_point``; in that case the
            shutdown event is set, because there is nothing to crawl.
        """
        shared = self._shared_objects
        starting_url = shared.config.get("starting_point")
        if not starting_url:
            # Enqueuing ``None`` would hand the workers a job that is not a URL.
            print(f"[{self.name}] No starting_point configured, exiting...")
            shared.shutdown_event.set()
            return False

        shared.not_visited_nodes[starting_url] = "from config.json"
        shared.job_queue.put(starting_url)
        return True

    def _wait_until_exhausted(self) -> None:
        """Poll ``not_visited_nodes`` and set the shutdown event once it stays empty.

        Returns as soon as the shutdown event is set, whether by this method or
        by anyone else.
        """
        shared = self._shared_objects
        # NOTE(review): assumes ``shutdown_event`` offers ``wait(timeout)`` like
        # ``threading.Event`` / ``multiprocessing.Event``. Only ``is_set`` and
        # ``set`` were used on it before, so confirm against ``SharedObjects``.
        stop = shared.shutdown_event

        # Waiting on the event instead of sleeping lets a shutdown requested
        # elsewhere end this thread immediately rather than after the timeout.
        if stop.wait(self._STARTUP_GRACE_SECONDS):
            return

        empty_polls_left = self._EMPTY_POLLS_BEFORE_EXIT
        while not stop.wait(self._POLL_INTERVAL_SECONDS):
            if shared.not_visited_nodes:
                # Work showed up again: restart the countdown so that only
                # consecutive empty observations count towards the exit.
                empty_polls_left = self._EMPTY_POLLS_BEFORE_EXIT
                continue

            empty_polls_left -= 1
            if empty_polls_left <= 0:
                stop.set()
                print(f"[{self.name}] Ran out from not visited URLs, exiting...")

    def __del__(self) -> None:
        """Report on stdout that the controller object is being finalised."""
        print(f"[{self.name}] exited", flush=True)