#!/usr/bin/env python3
"""Download and parse the Austrian callbook (list of amateur radio calls).

The script either uses an already downloaded PDF file or opens the website
of the authority with Selenium/Chrome, clicks the first link whose text
contains "Rufzeichen", and then walks through the text of the PDF line by
line.
"""
import argparse
import os
import re  # regular expression
import sys
import time

import pandas as pd
import pypdf
from pypdf import PdfReader
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromiumService
from selenium.webdriver.common.by import By

__version__ = '1.0.0'
__website__ = 'https://www.fb.gv.at/Funk/amateurfunkdienst.html'

# Columns of a record are separated by runs of at least four spaces.
# Never split on 2 or 3 spaces: addresses may contain them.  Several records
# contain no address or no location, so the number of fields varies.
_FIELD_SEPARATOR = re.compile(r'[ ]{4,65}')
# An Austrian call: "OE", one digit, one to three capital letters.
_CALL_PATTERN = re.compile(r'^(OE[0-9][A-Z]{1,3})')
# Only this many leading characters of a name are used for the lookup in
# the club station file.
_CLUB_KEY_LENGTH = 40


def call_description():
    """Print a one-line description of the program including its version."""
    print(f'Download and Parse the Austrian Callbook Version {__version__}')


def call_parser():
    """Parse the command line.

    Returns:
        The ``argparse.Namespace`` with the attributes ``interactive``
        (bool), ``verbose`` (int, number of ``-v`` flags), ``path`` (str)
        and ``url`` (str).
    """
    parser = argparse.ArgumentParser(
        description='Download and Parse the Austrian Callbook',
        epilog=f''' Written by Thomas Kuschel, Version {__version__} ''',
    )
    parser.add_argument('-i', '--interactive', action='store_true',
                        default=False)
    parser.add_argument('-V', '--version', action='version',
                        version='{} {}'.format(os.path.split(__file__)[1],
                                               __version__))
    # 'count' yields 0 without any -v and n for n occurrences of -v.
    parser.add_argument('-v', '--verbose', action='count', default=0)
    parser.add_argument('-p', '--path',
                        default='Rufzeichenliste_AT_Stand_010624.pdf',
                        help='skip the download if the specified path to a '
                             'PDF file exists')
    parser.add_argument('url', metavar='URL', nargs='?', default=__website__)
    return parser.parse_args()


def call_website(url, verbose, path='', interactive=False):
    """Return the name of the callbook PDF, downloading it if necessary.

    Args:
        url: Website that contains the link to the callbook.
        verbose: Verbosity level (currently not used here).
        path: Path of an existing PDF file.  If it is non-empty and exists,
            it is returned and no browser is started.
        interactive: Start a visible Chrome window instead of a headless one
            and keep it open for 300 seconds after the click.

    Returns:
        ``path`` if it exists, otherwise the base name of the ``href`` of
        the clicked link.

    Raises:
        SystemExit: with code 3 if a non-empty ``path`` does not exist, with
            code 2 if the website has no link containing "Rufzeichen".
    """
    if path:
        if os.path.exists(path):
            return path
        # NOTE(review): the help text of --path suggests that a missing file
        # should trigger the download instead of terminating - confirm.
        print(f'The given path "{path}" does not exist.')
        sys.exit(3)

    if interactive:
        print('Interactive')
        driver = webdriver.Chrome()
    else:
        print('Headless Script')
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome(options=options)

    # quit() in a finally clause ends the browser and the driver process on
    # every way out, including sys.exit() and exceptions.
    try:
        driver.get(url)
        print(driver.title)
        elements = driver.find_elements(By.PARTIAL_LINK_TEXT, "Rufzeichen")
        if not elements:
            print('Sorry, no Link containing "Rufzeichen" found.')
            sys.exit(2)
        element = elements[0]  # take the first one
        href = element.get_attribute('href')
        element.click()
        print(element.text)
        # Presumably gives the browser time to finish the download.
        time.sleep(300 if interactive else 5)
    finally:
        driver.quit()
    return os.path.basename(href)


def remove_first_quote_if_odd(text, verbose=0):
    """Remove the first double quote if ``text`` has an odd number of them.

    Args:
        text: The string to check.
        verbose: If greater than 0, the text is printed before and after
            the removal.

    Returns:
        ``text`` without its first ``"`` if the number of ``"`` is odd,
        otherwise ``text`` unchanged.
    """
    if text.count('"') % 2 == 0:
        return text
    if verbose > 0:
        print(text)
    text = text.replace('"', '', 1)
    if verbose > 0:
        print(text)
    return text


def is_clubstation(call):
    """Return True if the fourth character of ``call`` is an 'X' or 'x'.

    Strings with fewer than four characters are never club stations.
    """
    return len(call) > 3 and call[3].upper() == 'X'


def replace_substring_with_line(path, search_substring, verbose=0):
    """Look up the full line that contains the start of ``search_substring``.

    The first 40 characters of ``search_substring`` are searched
    case-insensitively in every line of the text file ``path``.

    Args:
        path: Text file with one entry per line.
        search_substring: The (possibly truncated) text to look for.
        verbose: Currently not used.

    Returns:
        The first matching line without surrounding whitespace, or
        ``search_substring`` unchanged if nothing matches, if it is empty,
        or if the file cannot be read.
    """
    key = search_substring[:_CLUB_KEY_LENGTH].lower()
    if not key:
        # An empty key is contained in every line; do not return line one.
        return search_substring
    try:
        with open(path, 'r', encoding='utf-8') as file:
            for line in file:
                if key in line.lower():
                    # Strip the trailing newline of the file line.
                    return line.strip()
    except FileNotFoundError:
        print(f'The file {path} was not found.')
    except (OSError, UnicodeError) as e:
        print(f'An error occurred: {e}')
    return search_substring


def call_data_record(line, mod_date, verbose):
    """Split one text line of the callbook into its fields.

    The line is split at runs of four or more spaces; the first field is
    taken as the call, the second one as the name.  Matching a complete
    record with one regular expression was tried before and did not work
    100%.  For club stations an unbalanced leading quotation mark is removed
    from the name, and the name is completed from the file
    ``.callbook_club`` next to this script, if that file exists.

    Lines that do not start with a call or that have no second field are
    skipped.

    Args:
        line: One stripped line of the extracted page text.
        mod_date: Modification date of the PDF (currently not used).
        verbose: Verbosity level; higher values print more details.
    """
    records = _FIELD_SEPARATOR.split(line)
    if verbose > 2:
        print(f'Record length: {len(records)}')
        for field in records:
            print(field)

    # OE Call:
    call = records[0]
    if _CALL_PATTERN.search(call) is None or len(records) < 2:
        if verbose > 0:
            print(f'Skipping line without call and name: {line!r}')
        return
    fullname = records[1]

    if is_clubstation(call):
        # Name starting with only one quotation mark, e.g. " -- remove that
        # one (only found at club stations):
        fullname = remove_first_quote_if_odd(fullname, verbose)
        if verbose > 0:
            print(f'Call: {call}, Name: {fullname}')
        clubstationfile = '.callbook_club'
        path = os.path.join(os.path.dirname(os.path.abspath(__file__)),
                            clubstationfile)
        if os.path.exists(path):
            fullname = replace_substring_with_line(path, fullname, verbose)
            if verbose > 0:
                print(f'Call: {call}, Name: {fullname}')


def call_analyse_pdf(file, verbose):
    """Read the PDF ``file`` and pass every record line to the parser.

    Args:
        file: Path of the PDF file.
        verbose: Verbosity level.  Any value above 0 prints the document
            information, 2 and above print every processed line as well.
    """
    reader = PdfReader(file)
    # The document information may be missing entirely.
    meta = reader.metadata
    if verbose:
        print(verbose)
        print(' Pages:', len(reader.pages))
        # All of the following could be None!
        print(f' Author: {getattr(meta, "author", None)}')
        print(f' Creator: {getattr(meta, "creator", None)}')
        print(f'Producer: {getattr(meta, "producer", None)}')
        print(f' Subject: {getattr(meta, "subject", None)}')
        print(f' Title: {getattr(meta, "title", None)}')
        print(f' Created: {getattr(meta, "creation_date", None)}')
        print(f'Modified: {getattr(meta, "modification_date", None)}')

    mod_date = meta.modification_date if meta is not None else None
    for page in reader.pages:
        page_text = page.extract_text(extraction_mode="layout",
                                      layout_mode_space_vertically=False)
        # The first three and the last two lines of a page are skipped
        # (presumably page header and footer).
        for line in page_text.strip().splitlines()[3:-2]:
            line = line.strip()
            if verbose >= 2:
                print(line)
            call_data_record(line, mod_date, verbose)


def main():
    """Run the program and return its exit status (0 success, 1 error)."""
    args = call_parser()
    try:
        filename = call_website(**vars(args))
        print(f'Filename: {filename}')
        call_analyse_pdf(filename, args.verbose)
    except Exception as e:
        # Top-level boundary: report any failure and signal it to the shell.
        print('Error: {}'.format(e), file=sys.stderr)
        return 1
    return 0


if __name__ == '__main__':
    sys.exit(main())