#!/usr/bin/env python3
import argparse
import os
import mariadb
import sys
import time
import pypdf
from pypdf import PdfReader
import re # regular expression
import datetime

# Version string shown by --version, in the --help epilog and in the banner.
__version__ = '1.0.0'
# Default value of the positional URL argument: the page that is searched
# for a link whose text contains "Rufzeichen".
__website__ = 'https://www.fb.gv.at/Funk/amateurfunkdienst.html'

from selenium import webdriver
from selenium . webdriver . common . by import By
from selenium . webdriver . chrome . service import Service as ChromiumService


def call_description():
    """Print a one-line banner with the tool name and its version."""
    print(f'Download and Parse the Austrian Callbook Version {__version__}')


def call_parser():
    """Parse the command line and return the populated namespace.

    Returns:
        argparse.Namespace with the attributes ``interactive``, ``verbose``
        (always an int, 0 when -v was not given), ``path``, ``output``,
        ``mariadb`` and ``url``. The attribute names match the parameters of
        ``call_website`` because the caller forwards them as keyword
        arguments.
    """
    parser = argparse.ArgumentParser(
        description='Download and Parse the Austrian Callbook',
        epilog=f'''
            Written by Thomas Kuschel,
            Version {__version__}
            ''',
    )
    parser.add_argument('-i', '--interactive', action='store_true', default=False)
    parser.add_argument(
        '-V', '--version', action='version',
        version='{} {}'.format(os.path.split(__file__)[1], __version__),
    )
    # Every -v raises the verbosity by one; 'count' with default=0 yields the
    # same integer the former append_const/sum construction produced.
    parser.add_argument('-v', '--verbose', action='count', default=0)
    # Name of the list as published, e.g. Rufzeichenliste_AT_Stand_010624.pdf
    parser.add_argument(
        '-p', '--path',
        default='afu/Rufzeichenliste_AT_Stand_010624.pdf',
        help='skip the download if the specified path to a PDF file exists',
    )
    parser.add_argument(
        '-o', '--output', default='',
        help='specify the file where the data are written to, default stdout',
    )
    parser.add_argument(
        '-m', '--mariadb',
        help='SQL interface to MariaDB (MySql) format '
             '"<IP-Address>:<Port> <User> <Passwd>" or defined in .config',
    )
    parser.add_argument('url', metavar='URL', nargs='?', default=__website__)
    return parser.parse_args()

def call_website(url, verbose, path='', interactive=False, output='', mariadb=''):
    """Return the PDF to parse, downloading it with Chrome when necessary.

    Args:
        url: Page that is searched for a link whose text contains
            "Rufzeichen".
        verbose: Unused here; accepted because the caller forwards the whole
            argparse namespace as keyword arguments.
        path: When non-empty, no browser is started: the path is returned if
            it exists, otherwise the process exits with status 3.
        interactive: Open a visible browser and keep it open for 300 seconds
            instead of running headless for 4 seconds.
        output: Unused here (see ``verbose``).
        mariadb: Unused here (see ``verbose``); shadows the module of the
            same name inside this function only.

    Returns:
        ``path`` if it was given and exists, otherwise the base name of the
        clicked link's ``href``.

    Raises:
        SystemExit: status 3 if ``path`` does not exist, status 2 if the page
            contains no matching link.
    """
    if path:
        if os.path.exists(path):
            return path
        print(f'The given path "{path}" does not exist.')
        sys.exit(3)

    if interactive:
        print('Interactive')
        driver = webdriver.Chrome()
    else:
        print('Headless Script')
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome(options=options)

    # quit() in a finally block ends the browser session on every exit path,
    # including sys.exit() and unexpected WebDriver errors.
    try:
        driver.get(url)
        print(driver.title)
        elements = driver.find_elements(By.PARTIAL_LINK_TEXT, 'Rufzeichen')
        if not elements:
            print('Sorry, no Link containing "Rufzeichen" found.')
            sys.exit(2)
        element = elements[0]  # take the first one
        href = element.get_attribute('href')
        # Read the text before clicking: the click may replace the page and
        # leave the element reference stale.
        link_text = element.text
        element.click()
        print(link_text)
        # Fixed wait that gives the browser time to finish the download.
        time.sleep(300 if interactive else 4)
    finally:
        driver.quit()
    return os.path.basename(href)


def remove_first_quote_if_odd(text, verbose=0):
    """Drop the first double quote when *text* has an odd number of them.

    Args:
        text: String to repair.
        verbose: When greater than 1 the text is printed before and after
            the removal.

    Returns:
        *text* without its first ``"`` if the quote count was odd, otherwise
        *text* unchanged.
    """
    if text.count('"') % 2 == 0:
        return text
    first = text.index('"')  # present: the count is odd, hence at least 1
    if verbose > 1:
        print(text)
    text = text[:first] + text[first + 1:]
    if verbose > 1:
        print(text)
    return text


def is_clubstation(call):
    """Tell whether *call* is treated as a club station.

    A call counts as a club station when its fourth character is ``X``
    (case-insensitive) or when it is the special case ``OE5SIX``.

    Raises:
        ValueError: if *call* has fewer than four characters.
    """
    if len(call) <= 3:
        # An explicit exception survives "python -O", unlike an assert.
        raise ValueError(f'Call sign too short: {call!r}')
    return call[3].upper() == 'X' or call.upper() == 'OE5SIX'


def replace_substring_with_line(path, search_substring, verbose=0):
    """Return the line of the file at *path* that contains *search_substring*.

    The comparison is case-insensitive and uses only the first 46 characters
    of *search_substring* (presumably because longer names arrive truncated
    -- TODO confirm). The file is read once and cached on the function
    object; the cache is refreshed when a different *path* is passed.

    Args:
        path: Text file with one replacement candidate per line.
        search_substring: Text to look for.
        verbose: Unused.

    Returns:
        The first matching line with surrounding whitespace stripped, or
        *search_substring* unchanged if nothing matches, the search text is
        empty, or the file cannot be read.
    """
    needle = search_substring[0:46].lower()
    if not needle:
        # An empty needle is contained in every line; do not "replace" it
        # with whatever happens to be the first line of the file.
        return search_substring
    cache = replace_substring_with_line
    try:
        # "is None" (not falsiness) so that an empty file is cached as well.
        if cache.lines is None or cache.path != path:
            with open(path, 'r', encoding='utf-8') as file:
                cache.lines = file.readlines()
            cache.path = path
    except FileNotFoundError:
        print(f'The file {path} was not found.')
        return search_substring
    except (OSError, UnicodeError) as e:
        print(f'An error occurred: {e}')
        return search_substring
    for line in cache.lines:
        if needle in line.lower():
            return line.strip()
    return search_substring


replace_substring_with_line.lines = None  # cached file content
replace_substring_with_line.path = None   # file the cache was read from


def gender_substring(path, search_substring, verbose=0):
    """Look up the gender character recorded for a first name.

    Each line of the file at *path* is read as ``<char> <name>``: the first
    character is the gender code (e.g. 'f' or 'm') and everything from
    position 2 on, stripped, is the name. The file is read once, indexed by
    name and cached on the function object; the cache is refreshed when a
    different *path* is passed.

    Args:
        path: Gender list file.
        search_substring: First name to look up (exact, case-sensitive).
        verbose: Unused.

    Returns:
        The gender character of the first line whose name equals
        *search_substring*, or 'x' if the name is unknown or the file cannot
        be read.
    """
    cache = gender_substring
    try:
        # "is None" (not falsiness) so that an empty file is cached as well.
        if cache.lines is None or cache.path != path:
            with open(path, 'r', encoding='utf-8') as file:
                lines = file.readlines()
            index = {}
            for line in lines:
                name = line[2:].strip()  # skip the gender char and separator
                if name:
                    # setdefault keeps the first entry, like a top-down scan
                    index.setdefault(name, line[0])
            cache.lines = lines
            cache.index = index
            cache.path = path
    except FileNotFoundError:
        print(f'The file {path} was not found.')
        return 'x'
    except (OSError, UnicodeError) as e:
        print(f'An error occurred: {e}')
        return 'x'
    return cache.index.get(search_substring, 'x')  # 'x': unknown gender


gender_substring.lines = None  # cached file content
gender_substring.path = None   # file the cache was read from
gender_substring.index = {}    # name -> gender char, built from .lines


def get_gender(firstnames, surname, call, verbose=0):
    """Guess the gender of a licensee from the first of the first names.

    The lookup uses the file ``.gender`` located next to this script. If that
    file does not exist, 'x' is returned without any message.

    Args:
        firstnames: One or more first names separated by spaces; only the
            first one is looked up.
        surname: Used for the diagnostic message only.
        call: Used for the diagnostic message only.
        verbose: When greater than 0, every name missing from the file is
            reported together with a running counter.

    Returns:
        The gender character found in the file, or 'x' if unknown.
    """
    genderfile = '.gender'
    gender = 'x'
    gpath = os.path.join(os.path.dirname(os.path.abspath(__file__)), genderfile)
    if os.path.exists(gpath):
        # Only the first name decides when several first names are given.
        firstname = firstnames.split(' ', 1)[0]
        gender = gender_substring(gpath, firstname, verbose=0)
        if gender == 'x' and verbose > 0:
            get_gender.cnt += 1
            print(f'({get_gender.cnt}) {call} "{firstname}" [{firstnames} {surname}] '
                  f'not found in file {genderfile} - gender "x" is set.')
    return gender


get_gender.cnt = 0  # number of names reported as missing so far


# Lower-case surname particles that belong to the following word.
_SURNAME_PARTICLES = frozenset({'de', 'el', 'da', 'van', 'von', 'della'})
# Two-word particles, e.g. "van der Meulen", "von der Vogelweide",
# "van den Berg".
_SURNAME_DOUBLE_PARTICLES = frozenset({('van', 'der'), ('von', 'der'), ('van', 'den')})


def call_split_name(fullname, call, verbose):
    """Split "Surname Firstname(s)" into its parts and guess the gender.

    Surname particles such as "de", "el", "da", "van", "von", "della" and the
    two-word forms "van der", "von der", "van den" are kept with the surname
    and written in lower case, e.g. "Van Der Meulen Jan" gives the surname
    "van der Meulen".

    Args:
        fullname: Name as listed, surname first.
        call: Call sign, passed on to ``get_gender`` for its messages.
        verbose: When greater than 1, compound surnames are printed; the
            value is also passed on to ``get_gender``.

    Returns:
        Tuple ``(firstname, surname, gender)``; ``firstname`` is
        '<unknown>' when nothing follows the surname.

    Raises:
        ValueError: if *fullname* is shorter than two characters or blank.
    """
    words = fullname.split()
    if len(fullname) <= 1 or not words:
        raise ValueError(f'Name too short to split: {fullname!r}')

    # Count the leading particles by word instead of by fixed character
    # offsets, so a lone particle or a double space cannot shift the slices.
    particles = 0
    first = words[0].lower()
    if first in _SURNAME_PARTICLES and len(words) > 1:
        particles = 1
        if len(words) > 2 and (first, words[1].lower()) in _SURNAME_DOUBLE_PARTICLES:
            particles = 2

    # maxsplit keeps all first names together in the last element.
    parts = fullname.split(None, particles + 1)
    surname = ' '.join([p.lower() for p in parts[:particles]] + [parts[particles]])
    firstname = parts[particles + 1] if len(parts) > particles + 1 else '<unknown>'
    if particles and verbose > 1:
        print(f'## {fullname} --> {surname} ##')

    # In Austria a call suffix starting with Y denotes a YL (young lady);
    # the file list is used instead of that heuristic.
    gender = get_gender(firstname, surname, call, verbose)
    return firstname, surname, gender



def _typo_columns(lines):
    """Return the [call, surname, firstname] offsets of the '*' header line."""
    for line in lines:
        if line.startswith('*'):
            words = line.split()
            if len(words) != 4:  # i.e. '*, call, nachname, vorname'
                raise ValueError(f'Malformed header line: {line.rstrip()!r}')
            return [line.index(word) for word in words[1:]]
    return []


def _typo_fullname(line, spaces):
    """Build "Surname Firstname" from the columns of a correction line."""
    surname = line[spaces[1]:spaces[2]].strip()
    # strip() instead of cutting the last character: the final line of the
    # file may lack a trailing newline.
    firstname = line[spaces[2]:].strip()
    return surname + ' ' + firstname


def fix_typo(call, fullname, verbose=1):
    """Apply the manual corrections from ``.typo_callbook`` to a name.

    The file is looked up next to this script; without it *fullname* is
    returned unchanged. Its first four lines are skipped. The first line
    starting with '*' is the header whose word positions define the columns
    call / surname / first name. In every other line the first character
    selects the action for the call found in columns 2-7:

    * ``#`` - comment, ignored
    * ``F`` - fix: the listed name replaces *fullname* if exactly one of
      first name and surname differs
    * ``X`` - exchange: the listed name replaces *fullname* if first name and
      surname are swapped

    Args:
        call: Call sign of the record.
        fullname: Name as read from the callbook, surname first.
        verbose: >0 prints the file once and every applied line, >1 also
            prints matching comment lines.

    Returns:
        The corrected name, or *fullname* if no correction applies or an
        error occurs (errors are printed, never raised).
    """
    fixtypofile = '.typo_callbook'
    path = os.path.join(os.path.dirname(os.path.abspath(__file__)), fixtypofile)
    if not os.path.exists(path):
        return fullname
    try:
        if fix_typo.lines is None:
            with open(path, 'r', encoding='utf-8') as file:
                fix_typo.lines = file.readlines()
            if verbose > 0:
                print(f'File "{fixtypofile}":')
                for line in fix_typo.lines:
                    print(f'>> {line.rstrip()}')
                print('>> ** EOF **')

        entries = fix_typo.lines[4:]  # starting with line 4
        if not fix_typo.spaces:
            fix_typo.spaces = _typo_columns(entries)
        if not fix_typo.spaces:
            return fullname  # no header line, columns unknown

        for line in entries:
            # Compare the whole call: a substring test would let OE1AB
            # match the line written for OE1ABC.
            window = line[2:8].split()
            if not window or window[0] != call:
                continue
            if verbose > 1:
                print(f'Call: {call} found')
            marker = line[:1]
            if marker == '#':
                if verbose > 1:
                    print(line.rstrip())
                continue
            if marker not in ('F', 'X'):
                continue

            if verbose > 0:
                print(line.rstrip())
                print(fullname)
            firstname1, surname1, _ = call_split_name(fullname, call, 0)
            fullname2 = _typo_fullname(line, fix_typo.spaces)
            if verbose > 0:
                print(fullname2)
            firstname2, surname2, _ = call_split_name(fullname2, call, 0)

            if marker == 'F':
                # Hardening: at a minimum, either the first names or the
                # surnames must fit.
                differences = (firstname1 != firstname2) + (surname1 != surname2)
                already_fixed = differences == 0
                implausible = differences > 1
            else:  # 'X': exchange the surname with the first name
                already_fixed = firstname1 == firstname2 and surname1 == surname2
                implausible = firstname1 != surname2 or surname1 != firstname2

            if already_fixed:
                print(f'It is fixed! You can remove the line with the item {call} '
                      f'from the file {fixtypofile}!')
            elif implausible:
                print(f'Something went wrong, there are several bugs. '
                      f'Check line with call {call} in file {fixtypofile}!')
            else:
                fullname = fullname2
    except FileNotFoundError:
        print(f'The file {path} was not found.')
    except Exception as e:
        # Deliberately best-effort: a broken typo file must not stop the run.
        print(f'An error occurred: {e}')
    return fullname


fix_typo.lines = None  # cached content of .typo_callbook
fix_typo.spaces = []   # column offsets [call, surname, firstname]


def call_data_record(line, mod_date, verbose, cur):
    """Parse one callbook line and insert it into ``callbook_user``.

    Args:
        line: One text line holding the five columns call, name, location,
            address and permit class, separated by runs of at least four
            spaces.
        mod_date: Currently unused.
            NOTE(review): possibly meant for the `modified` column, which
            receives the insertion time instead -- confirm.
        verbose: >1 prints the parsed record, >2 also the raw columns.
        cur: Database cursor whose ``execute(statement, data)`` is called.

    Raises:
        ValueError: if the line does not split into five columns or the
            first column does not start with an OE call sign.
    """
    # Addresses contain runs of 2 or 3 spaces and some records have no
    # address or location, so only runs of 4 or more spaces separate columns.
    records = re.split(r'[ ]{4,65}', line)
    if verbose > 2:
        print(f'Record length: {len(records)}')
        for column in records:
            print(column)
    # HARDENING:
    if len(records) != 5:
        raise ValueError(f'Expected 5 columns, found {len(records)}: {line!r}')
    call, fullname, location, address, permit_class = records
    # Prefix check only, as before.
    # NOTE(review): re.fullmatch may be what was intended -- confirm.
    if re.match(r'OE[0-9][A-Z]{1,3}', call) is None:
        raise ValueError(f'Not an OE call sign: {call!r}')

    fullname = fix_typo(call, fullname, verbose)
    firstname = ''
    surname = ''
    if is_clubstation(call):
        # Name starting with only one quotation mark -- remove that one
        # (only found at club stations).
        fullname = remove_first_quote_if_odd(fullname, verbose)
        clubstationfile = '.callbook_club'
        if verbose > 1:
            print(f'Call: {call}, Name: {fullname}')
        path = os.path.join(os.path.dirname(os.path.abspath(__file__)), clubstationfile)
        if os.path.exists(path):
            fullname = replace_substring_with_line(path, fullname, verbose)
        gender = '*'
        firstname = fullname.strip()  # the club name is stored as first name
    elif fullname.startswith('*'):
        gender = '*'
    else:  # try to split the YL's or OM's name and guess the gender
        firstname, surname, gender = call_split_name(fullname, call, verbose)

    if verbose > 1:
        if gender == '*':
            print(f'Call: {call}, Name: {fullname}, Gender: {gender}')
        else:
            print(f'Call: {call}, First Name: {firstname}, Surname: {surname}, Gender: {gender}')
        print(f'Location: {location}, Address: {address}, Permit: {permit_class}')

    created = datetime.datetime.now().strftime('%Y-%m-%d %H:%M:%S')
    call_data_record.cnt += 1  # increment the user_id
    user_id = call_data_record.cnt
    # Parameterized statement: the values are never pasted into the SQL text.
    statement = ("INSERT INTO `callbook_user`"
                 "(`user_id`,`call`,`firstname`,`surname`,`gender`,`created`,`modified`) "
                 "VALUES (%s,%s,%s,%s,%s,%s,%s)")
    data = (user_id, call, firstname, surname, gender, created, created)
    try:
        cur.execute(statement, data)
    except mariadb.Error as e:
        print(f'\n[WARN] MySQLError during execute statement\n\tArgs: {e.args}')
    except Exception as e:
        print(f'Error: {e}', file=sys.stderr)


call_data_record.cnt = 0  # last user_id handed out


def call_analyse_pdf(file, verbose, cur):
    """Extract every record line from the callbook PDF and store it.

    On each page the first three and the last two text lines are skipped;
    every remaining non-empty line is handed to ``call_data_record``.

    Args:
        file: Path (or file object) accepted by ``pypdf.PdfReader``.
        verbose: Any non-zero value prints the document metadata, >=2 also
            prints every line before it is processed.
        cur: Database cursor passed through to ``call_data_record``.
    """
    reader = PdfReader(file)
    # The metadata object as a whole, and each of its fields, may be None.
    meta = reader.metadata
    if verbose:
        print(verbose)
        print(f'Pages: {len(reader.pages)}')
        for label, attribute in (
            ('Author', 'author'),
            ('Creator', 'creator'),
            ('Producer', 'producer'),
            ('Subject', 'subject'),
            ('Title', 'title'),
            ('Created', 'creation_date'),
            ('Modified', 'modification_date'),
        ):
            print(f'{label}: {getattr(meta, attribute, None)}')
    mod_date = getattr(meta, 'modification_date', None)

    for page in reader.pages:
        page_text = page.extract_text(extraction_mode="layout", layout_mode_space_vertically=False)
        lines = page_text.strip().splitlines()
        for line in lines[3:-2]:
            line = line.strip()
            if not line:
                continue  # a blank line carries no record
            if verbose >= 2:
                print(line)
            call_data_record(line, mod_date, verbose, cur)


def exec_sql_file(cursor, sql_file):
    """Execute the statements of an SQL script one after another.

    Lines starting with ``--`` are ignored. Lines are collected until one
    ends with ``;`` and are then executed as a single statement. Text left
    over after the last ``;`` is executed as a final statement. Database
    errors are reported and do not stop the script.

    Args:
        cursor: Database cursor whose ``execute(statement)`` is called.
        sql_file: Path of the SQL script.
    """
    def run(statement):
        try:
            cursor.execute(statement)
        except mariadb.Error as e:
            print(f'\n[WARN] MySQLError during execute statement\n\tArgs: {e.args}')

    pending = []
    try:
        # "with" closes the file even when a statement raises.
        with open(sql_file, encoding='utf-8') as handle:
            for line in handle:
                stripped = line.strip()
                if stripped.startswith('--'):  # ignore sql comment lines
                    continue
                pending.append(line)
                if stripped.endswith(';'):
                    run(''.join(pending))
                    pending = []
        leftover = ''.join(pending)
        if leftover.strip():
            # The last statement may come without a terminating ';'.
            run(leftover)
    except FileNotFoundError:
        print(f'The file {sql_file} was not found.')
    except Exception as e:
        print(f'Error: {e}', file=sys.stderr)


if __name__ == '__main__':
    # Exit codes: 1 download failed, 5 no database connection, 0 success
    # (call_website itself exits with 2 or 3).
    args = call_parser()
    try:
        filename = call_website(**vars(args))
        if args.verbose > 1:
            print(f'Filename: {filename}')
    except Exception as e:
        print(f'Error: {e}', file=sys.stderr)
        sys.exit(1)

    try:
        # NOTE(review): credentials are hard-coded and the -m/--mariadb
        # option is not evaluated yet -- move them out of the source.
        conn = mariadb.connect(
            user='om',
            password='oe3tkt',
            host='127.0.0.1',
            port=3306,
            database='callbook',
        )
    except mariadb.Error as e:
        print(f'Error connecting to MariaDB platform: {e}')
        sys.exit(5)

    print(datetime.datetime.now(datetime.UTC))
    # Get Cursor
    cur = conn.cursor()
    try:
        sql_file = '.sql_init'
        path = os.path.join(os.path.dirname(os.path.abspath(__file__)), sql_file)
        exec_sql_file(cur, path)
        call_analyse_pdf(filename, args.verbose, cur)
        # Commit only after the whole PDF was processed without an error.
        conn.commit()
    finally:
        # Release cursor and connection even if parsing aborts.
        cur.close()
        conn.close()
    sys.exit(0)