#!/usr/bin/env python3
"""Download and parse the Austrian amateur-radio callbook.

Uses Selenium to find and click the first link containing "Rufzeichen"
(call sign) on the Austrian regulator's website, which triggers the PDF
download, then inspects the PDF with pypdf / pandas.
"""
import argparse
import os
import re  # regular expressions
import sys
import time

import pandas as pd
import pypdf  # noqa: F401 -- kept from original file
from pypdf import PdfReader
from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromiumService  # noqa: F401

__version__ = '1.0.0'
__website__ = 'https://www.fb.gv.at/Funk/amateurfunkdienst.html'


def call_description():
    """Print a one-line program description including the version."""
    print(f'Download and Parse the Austrian Callbook Version {__version__}')


def call_parser():
    """Build the command-line parser and return the parsed arguments.

    Returns:
        argparse.Namespace with ``interactive`` (bool) and ``url`` (str,
        defaulting to the regulator's website).
    """
    parser = argparse.ArgumentParser(
        description='Download and Parse the Austrian Callbook',
        epilog=f'''
    Written by Thomas Kuschel, Version {__version__}
    ''')
    parser.add_argument('--interactive', '-i', action='store_true',
                        default=False)
    parser.add_argument('--version', '-v', action='version',
                        version='{} {}'.format(os.path.split(__file__)[1],
                                               __version__))
    parser.add_argument('url', metavar='URL', nargs='?', default=__website__)
    return parser.parse_args()


def call_website(url, interactive=False):
    """Open *url* in Chrome and click the first "Rufzeichen" link.

    Args:
        url: Page to load.
        interactive: When True run a visible browser and keep it open for
            5 minutes so the user can interact; otherwise run headless and
            wait 5 seconds for the download triggered by the click.

    Returns:
        The basename of the clicked link's ``href`` (the PDF file name,
        assuming the browser saved it under that name -- TODO confirm
        against the Chrome download directory).

    Raises:
        SystemExit: exit code 2 when no matching link is found.
    """
    if interactive:
        print('Interactive')
        driver = webdriver.Chrome()
    else:
        print('Headless Script')
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome(options=options)
    driver.get(url)
    print(driver.title)
    elements = driver.find_elements(By.PARTIAL_LINK_TEXT, "Rufzeichen")
    if not elements:
        print('Sorry, no Link containing "Rufzeichen" found.')
        driver.close()
        sys.exit(2)
    element = elements[0]  # take the first match
    href = element.get_attribute('href')
    # BUG FIX: the original did ``filename = element.click()`` -- click()
    # returns None; the file name must come from the href instead.
    element.click()
    print(element.text)
    # Give the browser time to finish the download before closing.
    time.sleep(300 if interactive else 5)
    driver.close()
    return os.path.basename(href)


def get_pdf_content_lines(pdf_file_path):
    """Yield every text line extracted from the PDF at *pdf_file_path*.

    Bug fixes versus the original: the PDF is opened in binary mode
    (pypdf requires bytes), ``extractText`` (removed PyPDF2 name) is
    replaced by ``extract_text``, and the ``spitlines`` typo is corrected
    to ``splitlines``.
    """
    with open(pdf_file_path, 'rb') as f:
        pdf_reader = PdfReader(f)
        for page in pdf_reader.pages:
            for line in page.extract_text().splitlines():
                yield line


def call_analyse_pdf(file):
    """Print the PDF's metadata and every OE call sign found in its text.

    Args:
        file: Path to (or file object of) the callbook PDF.
    """
    reader = PdfReader(file)
    meta = reader.metadata
    print('   Pages:', len(reader.pages))
    # All of the following metadata fields may be None.
    print(f'  Author: {meta.author}')
    print(f' Creator: {meta.creator}')
    print(f'Producer: {meta.producer}')
    print(f' Subject: {meta.subject}')
    print(f'   Title: {meta.title}')
    print(f' Created: {meta.creation_date}')
    print(f'Modified: {meta.modification_date}')
    for page in reader.pages:
        # Layout mode keeps the table columns roughly aligned in the text.
        page_text = page.extract_text(extraction_mode="layout",
                                      layout_mode_space_vertically=False)
        # Austrian call signs: "OE" + digit + 1-3 letters at a line start.
        calls = re.findall(r' +(OE[0-9][A-Z]{1,3}).*$', page_text)
        for call in calls:
            print(call)
        tables = re.findall(
            r'(?s)\b(?:\w+\s+){2,}\w+\b(?:\s*[,;]\s*\b(?:\w+\s+){2,}\w+\b)*',
            page_text)
        # Build a DataFrame per detected table.
        for table in tables:
            rows = table.strip().split('\n')
            cells = [row.split('|') for row in rows]
            cells = [[cell.strip() for cell in row] for row in cells]
            # Drop empty rows and cells.
            cells = [[cell for cell in row if cell] for row in cells if row]
            df = pd.DataFrame(cells[1:], columns=cells[0])
            # TODO: Clean and manipulate df as needed (currently unused).


if __name__ == '__main__':
    args = call_parser()
    try:
        filename = call_website(**vars(args))
        # BUG FIX: the original printed a placeholder-less f-string
        # ('Filename: (unknown)') and never used the computed filename.
        print(f'Filename: {filename}')
        call_analyse_pdf(filename)
        sys.exit(0)
    except Exception as e:
        print('Error: {}'.format(e), file=sys.stderr)
        sys.exit(1)