hc-spider/hc_spider/controller.py

44 lines
1.4 KiB
Python

import os
import threading
import time
from hc_spider.model import SharedObjects
class Controller(threading.Thread):
    """Daemon thread that seeds the crawl and signals shutdown when work runs out.

    On start it places the configured starting URL into the shared
    ``not_visited_nodes`` mapping and the shared ``job_queue``. It then polls
    ``not_visited_nodes`` periodically and sets the shared ``shutdown_event``
    once the mapping has been observed empty on several consecutive polls.
    """

    # Seconds slept before the first poll and between subsequent polls.
    _POLL_INTERVAL_SECONDS = 2
    # Number of *consecutive* empty polls required before shutdown is signalled.
    _EMPTY_POLLS_BEFORE_SHUTDOWN = 2

    _shared_objects: SharedObjects

    def __init__(self, **kwargs) -> None:
        """Create the controller thread.

        Args:
            **kwargs: Forwarded unchanged to ``SharedObjects``.
        """
        # Initialise the Thread machinery first: ``__del__`` reads ``self.name``,
        # which is only valid after ``Thread.__init__`` has run. Doing this
        # before building SharedObjects keeps ``__del__`` safe even if that
        # constructor raises.
        super().__init__(name="Controller", daemon=True)
        self._shared_objects = SharedObjects(**kwargs)

    def start(self) -> None:
        """Announce the start on stdout, then start the thread."""
        print(f"[{self.name}] is starting")
        super().start()

    def run(self) -> None:
        """Seed the job queue, then poll until shutdown is requested or work runs out."""
        print(f"{self.name} started with pid [{os.getpid()}]")
        shared = self._shared_objects

        # Enqueue starting point.
        # NOTE(review): if "starting_point" is absent, ``.get`` presumably yields
        # None and None is enqueued as a job -- confirm the config is validated
        # upstream.
        starting_url = shared.config.get("starting_point")
        shared.not_visited_nodes[starting_url] = "from config.json"
        shared.job_queue.put(starting_url)

        # Should wait a bit until the first items from workers will be placed
        # in the queue.
        time.sleep(self._POLL_INTERVAL_SECONDS)

        empty_polls_left = self._EMPTY_POLLS_BEFORE_SHUTDOWN
        while not shared.shutdown_event.is_set():
            time.sleep(self._POLL_INTERVAL_SECONDS)

            if shared.not_visited_nodes:
                # Work is pending again: restart the grace period so that only
                # consecutive empty polls can trigger a shutdown.
                empty_polls_left = self._EMPTY_POLLS_BEFORE_SHUTDOWN
                continue

            empty_polls_left -= 1
            if empty_polls_left <= 0:
                shared.shutdown_event.set()
                print(f"[{self.name}] Ran out from not visited URLs, exiting...")

        print(f"[{self.name}] is shutting down", flush=True)

    def __del__(self) -> None:
        """Report on stdout when the controller object is finalised."""
        print(f"[{self.name}] exited", flush=True)