#!/usr/bin/env python3
"""Download and parse the Austrian amateur radio callbook.

The script obtains the callbook PDF (either from a local path or by clicking
the first "Rufzeichen" link on the given website with Selenium), extracts the
text of every page with pypdf, splits each line into a record and inserts the
call sign, name and guessed gender into the `callbook_user` table of a MariaDB
database.

Optional helper files are looked up next to this script:
    .gender         - gender letter in column 0, first name from column 2
    .callbook_club  - full club-station names, one per line
    .typo_callbook  - fixed-width table with corrections for names
    .sql_init       - SQL statements executed before the import
"""
import argparse
import datetime
import os
import re  # regular expression
import sys
import time

import mariadb
import pypdf
from pypdf import PdfReader
from selenium import webdriver
from selenium.webdriver.chrome.service import Service as ChromiumService
from selenium.webdriver.common.by import By

__version__ = '1.0.0'
__website__ = 'https://www.fb.gv.at/Funk/amateurfunkdienst.html'


def _local_file(name):
    """Return the absolute path of *name* in the directory of this script."""
    return os.path.join(os.path.dirname(os.path.abspath(__file__)), name)


def call_description():
    """Print a one-line description of the program including its version."""
    print(f'Download and Parse the Austrian Callbook Version {__version__}')


def call_parser(argv=None):
    """Parse the command line and return the populated namespace.

    Args:
        argv: Optional list of argument strings; ``None`` (the default) makes
            argparse read ``sys.argv[1:]``.

    Returns:
        The argparse namespace.  ``verbose`` is normalised to an int (number
        of ``-v`` flags) and a bare file name given as ``path`` is resolved
        relative to the directory of this script.
    """
    parser = argparse.ArgumentParser(
        description='Download and Parse the Austrian Callbook',
        # argparse re-wraps the epilog, so its internal whitespace is irrelevant.
        epilog=f''' Written by Thomas Kuschel, Version {__version__} '''
    )
    parser.add_argument('-i', '--interactive', action='store_true', default=False)
    # parser.add_argument('-s', '--server' default=__website__, required=False)
    parser.add_argument('-V', '--version', action='version',
                        version='{} {}'.format(os.path.split(__file__)[1], __version__))
    parser.add_argument('-v', '--verbose', action='append_const', const=1)
    # Rufzeichenliste_AT_Stand_010624.pdf
    parser.add_argument('-p', '--path', default='Rufzeichenliste_AT_Stand_010624.pdf',
                        help='skip the download if the specified path to a PDF file exists')
    # parser.add_argument('-t', '--type', default='' , help='specify the output, supported types are [ CSV | JSON ]')
    # not implemented yet
    parser.add_argument('-o', '--output', default='',
                        help='specify the file where the data are written to, default stdout')
    parser.add_argument('-m', '--mariadb',
                        help='SQL interface to MariaDB (MySql) format ": " or defined in .config')
    parser.add_argument('url', metavar='URL', nargs='?', default=__website__)
    opt = parser.parse_args(argv)
    # append_const yields None (no -v) or a list of 1s; fold it into a count.
    opt.verbose = 0 if opt.verbose is None else sum(opt.verbose)
    # A path without a directory part is taken relative to this script.
    if opt.path != '' and os.path.dirname(opt.path) == '':
        opt.path = _local_file(opt.path)
    return opt


def call_website(url, verbose, path='', interactive=False, output='', mariadb=''):
    """Return the callbook PDF file name, downloading it when no path is given.

    If *path* is non-empty it must exist and is returned unchanged; otherwise
    the process exits with status 3.  With an empty *path* a Chrome session is
    started (headless unless *interactive*), *url* is opened and the first
    link whose text contains "Rufzeichen" is clicked.

    Args:
        url: Page that contains the download link.
        verbose: Verbosity level (accepted for call compatibility, unused).
        path: Local PDF path that short-circuits the download.
        interactive: Show the browser and wait 300 s instead of 4 s.
        output: Accepted for call compatibility, unused.
        mariadb: Accepted for call compatibility, unused.

    Returns:
        *path* if given, else the base name of the clicked link's ``href``.
    """
    if path:
        if os.path.exists(path):
            return path
        print(f'The given path "{path}" does not exist.')
        sys.exit(3)

    if interactive:
        print('Interactive')
        driver = webdriver.Chrome()
    else:
        print('Headless Script')
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome(options=options)

    # try/finally so the browser is released on every exit path, including
    # sys.exit() and WebDriver exceptions.
    try:
        driver.get(url)
        print(driver.title)
        # elements = driver.find_elements(By.XPATH,'//a[contains(@href,"Rufzeichen")]')
        elements = driver.find_elements(By.PARTIAL_LINK_TEXT, "Rufzeichen")
        if not elements:
            print('Sorry, no Link containing "Rufzeichen" found.')
            sys.exit(2)
        element = elements[0]  # take the first one
        href = element.get_attribute('href')
        element.click()
        print(element.text)
        # print(href)
        # Give the browser time to finish the download before it is closed.
        time.sleep(300 if interactive else 4)
    finally:
        driver.quit()
    return os.path.basename(href)


def remove_first_quote_if_odd(text, verbose=0):
    """Drop the first double quote of *text* if it has an odd number of them.

    With ``verbose > 1`` the text is printed before and after the removal.
    """
    if text.count('"') % 2 == 0:
        return text
    repaired = text.replace('"', '', 1)
    if verbose > 1:
        print(text)
        print(repaired)
    return repaired


def is_clubstation(call):
    """Return True if *call* is a club station.

    A call is treated as a club station when its fourth character is 'X'
    (case-insensitive) or when it is the special case OE5SIX.
    """
    assert len(call) > 3
    # special case with OE5SIX (Clubstation)
    return call[3].upper() == 'X' or call.upper() == 'OE5SIX'


def replace_substring_with_line(path, search_substring, verbose=0):
    """Return the first line of *path* that contains *search_substring*.

    Only the first 46 characters of *search_substring* are compared, ignoring
    case.  The file is read once and cached on the function object.  If no
    line matches, the search string is empty, or the file cannot be read,
    *search_substring* is returned unchanged.
    """
    needle = search_substring[0:46].lower()
    if not needle:
        # An empty needle is contained in every line; do not replace anything.
        return search_substring
    try:
        if replace_substring_with_line.lines is None:
            with open(path, 'r') as file:
                replace_substring_with_line.lines = file.readlines()
        for line in replace_substring_with_line.lines:
            if needle in line.lower():
                return line.strip()
    except FileNotFoundError:
        print(f'The file {path} was not found.')
    except Exception as e:
        print(f'An error occurred: {e}')
    return search_substring


replace_substring_with_line.lines = None


def gender_substring(path, search_substring, verbose=0):
    """Look up the gender letter of a first name in the gender file.

    Each line of *path* carries the gender letter in column 0 and the name
    from column 2 onwards.  The file is read once and cached on the function
    object.

    Returns:
        The first character of the matching line (i.e. 'f' or 'm'), or 'x'
        when the name is empty, not found, or the file cannot be read.
    """
    if not search_substring:
        return 'x'  # nothing to look up, unknown gender
    try:
        if gender_substring.lines is None:
            with open(path, 'r') as file:
                gender_substring.lines = file.readlines()
        for line in gender_substring.lines:
            # search from position 2 and remove all spaces or \n chars
            if line[2:].strip() == search_substring:
                return line[0]  # return the char of gender i.e. 'f' or 'm'
    except FileNotFoundError:
        print(f'The file {path} was not found.')
    except Exception as e:
        print(f'An error occurred: {e}')
    return 'x'  # not found, unknown gender


gender_substring.lines = None


def get_gender(firstnames, surname, call, verbose=0):
    """Guess the gender from the first of the given first names.

    Uses the `.gender` file next to this script.  When the file is missing or
    the name is unknown, 'x' is returned; with ``verbose > 0`` every unknown
    name is reported together with a running counter.
    """
    # load the .gender file:
    genderfile = '.gender'
    gender = 'x'
    gpath = _local_file(genderfile)
    if os.path.exists(gpath):
        # only check 1st/firstname of name, important when there are more than 1 firstnames
        firstname = firstnames.split(' ', 1)[0]
        gender = gender_substring(gpath, firstname, verbose=0)
        if gender == 'x' and verbose > 0:
            get_gender.cnt += 1
            print(f'({get_gender.cnt}){call} "{firstname}" [{firstnames} {surname}] not found in file {genderfile} - gender "x" is set.')
    return gender


get_gender.cnt = 0


def _split_first_word(text):
    """Return ``(first_word, remainder)`` of *text*, split on whitespace."""
    parts = text.split(None, 1)
    if not parts:
        return '', ''
    if len(parts) == 1:
        return parts[0], ''
    return parts[0], parts[1]


def call_split_name(fullname, call, verbose):
    """Split 'surname firstname(s)' and guess the gender.

    Handles surnames with a lower-case particle such as "de Lijezer",
    "van Dijk", "el Shamaa", "da Silva", "della Rowere", two-word particles
    ("van der", "von der", "van den") and the special case
    "Senarclens de Grancy".

    Args:
        fullname: Name as found in the callbook, surname first.
        call: Call sign, only used for diagnostics of the gender lookup.
        verbose: Verbosity level; > 1 prints recognised compound surnames.

    Returns:
        Tuple ``(firstname, surname, gender)``.
    """
    assert len(fullname) > 1
    surname, rest = _split_first_word(fullname)
    particle = surname.lower()
    compound = False

    if particle in ('de', 'el', 'da', 'della', 'van', 'von'):
        following, rest = _split_first_word(rest)
        surname = particle + ' ' + following
        # e.g. "van der Meulen", "Walther von der Vogelweide", "Annie van den Berg"
        if particle in ('van', 'von') and surname.lower() in ('van der', 'von der', 'van den'):
            following, rest = _split_first_word(rest)
            surname = surname.lower() + ' ' + following
        compound = True
    elif particle == 'senarclens' and fullname.lower().startswith('senarclens de grancy'):
        # Everything after the three-word surname is the first name(s).
        surname = 'Senarclens de Grancy'
        rest = fullname[len('senarclens de grancy'):]
        compound = True

    if compound and verbose > 1:
        print(f'## {fullname} --> {surname} ##')

    # lstrip: there may be more than one space between surname and firstname
    firstname = rest.lstrip()
    # NOTE: in Austria a call suffix starting with Y denotes a YL (young
    # lady); that shortcut is deliberately not used, the gender file decides.
    gender = get_gender(firstname, surname, call, verbose)
    return firstname, surname, gender


def _typo_corrected_name(line):
    """Build 'surname firstname' from the fixed-width columns of a typo line.

    Column positions come from ``fix_typo.spaces`` (call, surname, firstname).
    """
    surname_col, firstname_col = fix_typo.spaces[1], fix_typo.spaces[2]
    surname = line[surname_col:firstname_col - 1].rstrip()
    # rstrip('\n') instead of [:-1]: the last line may lack a newline.
    return surname + ' ' + line[firstname_col:].rstrip('\n')


def fix_typo(call, fullname, verbose=1):
    """Apply a manual name correction from the `.typo_callbook` file.

    The file is read once and cached.  Lines before index 4 are skipped; the
    first following line starting with '*' is the header with four words
    ('*', call, surname, first name) whose positions define the columns.
    Every later line whose call column equals *call* is evaluated by its
    first character:

        '#'        comment, ignored
        'F' / 'N'  replace the name if exactly one of first name / surname
                   differs ('N' also tolerates a differently split surname)
        'X'        replace the name if surname and first name are swapped

    Returns:
        The corrected full name, or *fullname* unchanged.
    """
    fixtypofile = '.typo_callbook'
    path = _local_file(fixtypofile)
    if not os.path.exists(path):
        return fullname
    try:
        if fix_typo.lines is None:
            with open(path, 'r') as file:
                fix_typo.lines = file.readlines()
            if verbose > 0:
                print(f'File "{fixtypofile}":')
                for line in fix_typo.lines:
                    print(f'>> {line.rstrip()}')
                print('>> ** EOF **')

        for line in fix_typo.lines[4:]:  # starting with line 4
            if not fix_typo.spaces:  # not initialized
                if line[0] == '*':
                    words = line.split()
                    assert len(words) == 4  # i.e. '*, call, surname, first name'
                    fix_typo.spaces = [line.index(word) for word in words[1:]]
                continue

            # Compare the whole token so that e.g. OE1AB does not match OE1ABC.
            if call not in line[2:8].split():
                continue
            if verbose > 1:
                print(f'Call: {call} found')

            kind = line[0]
            if kind == '#':
                if verbose > 1:
                    print(line.rstrip())
                continue
            if kind not in ('F', 'N', 'X'):
                continue

            if verbose > 0:
                print(line.rstrip())
                print(fullname)
            firstname1, surname1, _ = call_split_name(fullname, call, 0)
            fullname2 = _typo_corrected_name(line)
            if verbose > 0:
                print(fullname2)
            firstname2, surname2, _ = call_split_name(fullname2, call, 0)

            if kind == 'X':
                # exchange the surname with firstname
                if firstname1 == firstname2 and surname1 == surname2:
                    print(f'It is fixed! You can remove the line with the item {call} from the file {fixtypofile}!')
                elif firstname1 != surname2 or surname1 != firstname2:
                    print(f'Something went wrong, there are several bugs. Check line with call {call} in file {fixtypofile}!')
                else:
                    fullname = fullname2
                continue

            # Hardening: at a minimum, either the first names or the surnames must fit
            fix_cnt = (firstname1 != firstname2) + (surname1 != surname2)
            # when the surname is split and wrongly written:
            if kind == 'N' and firstname2 in fullname and fix_cnt > 1:
                fix_cnt = 1
            if fix_cnt == 0:
                print(f'It is fixed! You can remove the line with the item {call} from the file {fixtypofile}!')
            elif fix_cnt > 1:
                print(f'Something went wrong, there are several bugs. Check line with call {call} in file {fixtypofile}!')
            else:
                fullname = fullname2
    except FileNotFoundError:
        print(f'The file {path} was not found.')
    except Exception as e:
        print(f'An error occurred: {e}')
    return fullname


fix_typo.lines = None
fix_typo.spaces = []


def call_data_record(line, mod_date, verbose, cur):
    """Parse one callbook line and insert it into `callbook_user`.

    The line is split on runs of 4 to 65 spaces into exactly five fields:
    call, full name, location, address and permit class.  Club stations keep
    their (possibly replaced) full name as first name with gender '*'; names
    starting with '*' get gender '*'; all other names are split into first
    name and surname and the gender is guessed.

    Args:
        line: One stripped text line of the PDF table.
        mod_date: Modification date of the PDF (currently unused).
        verbose: Verbosity level.
        cur: Database cursor used for the INSERT.

    Raises:
        AssertionError: If the line does not consist of five fields or the
            first field does not start with an OE call sign.
    """
    # we have to split the record with a cost-intensive regular expression
    # record = re.split('OE[0-9][A-Z]{1,3}[ \t]{3,20}',line) # this does not work 100%
    # record = re.findall(r'(OE[0-9][A-Z]{1,3})[ \t]{2,12}([A-ZÄÖÜ].+[ ]?.*[ ]?.*[ ]?.*)[ \t]{3,30}(.{3,30})[ \t]{3,30}([1,3,4])', line)
    # record = re.search(r'(OE[0-9][A-Z]{1,3})[ ]{2,12}([. ]+)[ ]{3,50}([. ]+)[ ]{3,50}([1-4])', line)
    # record = re.search(r'^(OE[0-9][A-Z]{1,3})[ \t]{2,20}([\w ]{1,12})[ ]{3,50}(.*)([1-4]{1})$', line)
    # Never split Addresses containing 2 or 3 spaces, also several records contain no address or no location
    records = re.split(r'[ ]{4,65}', line)
    if verbose > 2:
        print(f'Record length: {len(records)}')
        for field in records:
            print(field)
    # HARDENING:
    assert len(records) == 5
    call, fullname, location, address, permit_class = records
    # OE Call:
    match = re.search(r'^(OE[0-9][A-Z]{1,3})', call)
    assert match is not None, f'not an OE call sign: {call!r}'

    fullname = fix_typo(call, fullname, verbose)
    firstname = ''
    surname = ''
    if is_clubstation(call):
        # Name starting with only one quotation mark -- remove that one
        # (only found @ clubstations):
        fullname = remove_first_quote_if_odd(fullname, verbose)
        clubstationfile = '.callbook_club'
        if verbose > 1:
            print(f'Call: {call}, Name: {fullname}')
        path = _local_file(clubstationfile)
        if os.path.exists(path):
            fullname = replace_substring_with_line(path, fullname, verbose)
        gender = '*'
        firstname = fullname.strip()
    elif fullname.startswith('*'):
        gender = '*'
    else:
        # Try to split the YL or OMs Name, guess the gender
        firstname, surname, gender = call_split_name(fullname, call, verbose)

    if verbose > 1:
        if gender == '*':
            print(f'Call: {call}, Name: {fullname}, Gender: {gender}')
        else:
            print(f'Call: {call}, First Name: {firstname}, Surname: {surname}, Gender: {gender}')
        print(f'Location: {location}, Address: {address}, Permit: {permit_class}')

    created = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    call_data_record.cnt += 1  # increment the User_id
    user_id = call_data_record.cnt
    # Parameterized statement: values are passed separately from the SQL text.
    statement = "INSERT INTO `callbook_user`(`user_id`,`call`,`firstname`,`surname`,`gender`,`created`,`modified`) VALUES (%s, %s, %s, %s, %s, %s, %s)"
    data = (user_id, call, firstname, surname, gender, created, created)
    try:
        cur.execute(statement, data)
    except mariadb.Error as e:
        print(f'\n[WARN] MySQLError during execute statement\n\tArgs: {e.args}')
    except Exception as e:
        print('Error: {}'.format(e), file=sys.stderr)


call_data_record.cnt = 0


def call_analyse_pdf(file, verbose, cur):
    """Extract all records from the callbook PDF and store them.

    Every page is converted to text in pypdf's layout mode; the first three
    and the last two lines of each page are skipped and each remaining line
    is handed to ``call_data_record``.

    Args:
        file: Path of the PDF file.
        verbose: Verbosity level; any non-zero value prints the PDF metadata,
            >= 2 additionally prints every processed line.
        cur: Database cursor passed on to ``call_data_record``.
    """
    reader = PdfReader(file)
    meta = reader.metadata  # may be None when the PDF carries no metadata
    if verbose:
        print(verbose)
        print(' Pages:', len(reader.pages))
        if meta is not None:
            # All of the following could be None!
            print(f' Author: {meta.author}')
            print(f' Creator: {meta.creator}')
            print(f'Producer: {meta.producer}')
            print(f' Subject: {meta.subject}')
            print(f' Title: {meta.title}')
            print(f' Created: {meta.creation_date}')
            print(f'Modified: {meta.modification_date}')
    mod_date = meta.modification_date if meta is not None else None

    for page in reader.pages:
        page_text = page.extract_text(extraction_mode="layout", layout_mode_space_vertically=False)
        lines = page_text.strip().splitlines()
        for line in lines[3:-2]:
            line = line.strip()
            if verbose >= 2:
                print(line)
            call_data_record(line, mod_date, verbose, cur)


def exec_sql_file(cursor, sql_file):
    """Execute the SQL statements contained in *sql_file*.

    Lines starting with '--' are ignored.  Lines are accumulated until one
    ends with ';', then the accumulated statement is executed.  Database
    errors are reported as warnings and do not stop the processing.
    """
    statement = ''
    try:
        with open(sql_file) as handle:
            for line in handle:
                stripped = line.strip()
                if stripped.startswith('--'):  # ignore sql comment lines
                    continue
                # keep appending lines that don't end in ';'
                statement += line
                if not stripped.endswith(';'):
                    continue
                try:
                    cursor.execute(statement)
                except mariadb.Error as e:  # (OperationalError, ProgrammingError) as e:
                    print(f'\n[WARN] MySQLError during execute statement\n\tArgs: {e.args}')
                statement = ''
    except FileNotFoundError:
        print(f'The file {sql_file} was not found.')
    except Exception as e:
        print('Error: {}'.format(e), file=sys.stderr)


if __name__ == '__main__':
    args = call_parser()
    try:
        filename = call_website(**vars(args))
        if args.verbose > 1:
            print(f'Filename: {filename}')
    except Exception as e:
        print('Error: {}'.format(e), file=sys.stderr)
        sys.exit(1)

    try:
        # NOTE(review): credentials are hard-coded; the -m/--mariadb option is
        # parsed but not used yet - consider moving them out of the source.
        conn = mariadb.connect(
            user='om',
            password='oe3tkt',
            host='127.0.0.1',
            port=3306,
            database='callbook'
        )
    except mariadb.Error as e:
        print(f'Error connectiong to MariaDB platform: {e}')
        sys.exit(5)

    print(datetime.datetime.now(datetime.UTC))
    # try/finally: release cursor and connection even if the import fails.
    try:
        # Get Cursor
        cur = conn.cursor()
        try:
            sql_file = '.sql_init'
            path = _local_file(sql_file)
            exec_sql_file(cur, path)
            call_analyse_pdf(filename, args.verbose, cur)
            conn.commit()
        finally:
            cur.close()
    finally:
        conn.close()
    sys.exit(0)