scripts/afu/callbook.py

#!/usr/bin/env python3

import argparse
import os
import sys
import time
import pypdf
#from PyPDF2 import PdfReader
from pypdf import PdfReader
import re # regular expression
import pandas as pd

__version__ = '1.0.0'
__website__ = 'https://www.fb.gv.at/Funk/amateurfunkdienst.html'

from selenium import webdriver
from selenium.webdriver.common.by import By
from selenium.webdriver.chrome.service import Service as ChromiumService

def call_description():
	"""Print a one-line banner with the tool name and its version."""
	banner = f'Download and Parse the Austrian Callbook Version {__version__}'
	print(banner)

def call_parser():
	"""Parse the command line and return the resulting options.

	Returns:
		The ``argparse.Namespace`` produced by ``parse_args()`` with the
		attributes ``interactive``, ``verbose``, ``path`` and ``url``.
		``verbose`` is normalised to an integer: the number of times
		``-v``/``--verbose`` was given, or 0 if it was not given at all.
	"""
	script_name = os.path.basename(__file__)

	parser = argparse.ArgumentParser(
		description='Download and Parse the Austrian Callbook',
		epilog=f'''
		Written by Thomas Kuschel,
		Version {__version__}
		''',
	)
	parser.add_argument(
		'-i', '--interactive',
		action='store_true',
		default=False,
	)
	parser.add_argument(
		'-V', '--version',
		action='version',
		version=f'{script_name} {__version__}',
	)
	# Every -v appends a 1 to a list; the list is summed up below.
	parser.add_argument(
		'-v', '--verbose',
		action='append_const',
		const=1,
	)
	parser.add_argument(
		'-p', '--path',
		default='Rufzeichenliste_AT_Stand_010624.pdf',
		help='skip the download if the specified path to a PDF file exists',
	)
	parser.add_argument(
		'url',
		metavar='URL',
		nargs='?',
		default=__website__,
	)

	opt = parser.parse_args()
	# Without any -v the attribute is None, which counts as verbosity 0.
	opt.verbose = sum(opt.verbose or ())
	return opt

def call_website(url,verbose,path='',interactive=False):
	"""Return the file name of the callbook PDF, downloading it if needed.

	If ``path`` is non-empty, no browser is started: the path is returned
	when it exists, otherwise the script exits with status 3.

	With an empty ``path`` a Chrome browser is started (headless unless
	``interactive`` is true), ``url`` is opened and the first link whose
	text contains "Rufzeichen" is clicked. The script exits with status 2
	when no such link is found.

	Args:
		url: Address of the web page that links to the callbook PDF.
		verbose: Verbosity level; accepted for interface compatibility,
			currently not used here.
		path: Path of an already downloaded PDF file, or '' to download.
		interactive: Show the browser window and wait much longer before
			closing it.

	Returns:
		``path`` if it was given and exists, otherwise the base name of the
		clicked link's ``href``.
		NOTE(review): only the base name is returned; the directory the
		browser saves the download to is not configured here - confirm
		that it matches the directory the caller reads the file from.
	"""
	if path:
		if os.path.exists(path):
			return path
		print(f'The given path "{path}" does not exist.')
		sys.exit(3)

	if interactive:
		print('Interactive')
		driver = webdriver.Chrome()
	else:
		print('Headless Script')
		options = webdriver.ChromeOptions()
		options.add_argument('--headless')
		options.add_argument('--no-sandbox')
		options.add_argument('--disable-dev-shm-usage')
		driver = webdriver.Chrome(options=options)

	# try/finally guarantees that the browser and its driver process are
	# shut down on every exit path, including sys.exit() and exceptions.
	try:
		driver.get(url)
		print(driver.title)
		elements = driver.find_elements(By.PARTIAL_LINK_TEXT,"Rufzeichen")

		if not elements:
			print('Sorry, no Link containing "Rufzeichen" found.')
			sys.exit(2)

		element = elements[0] # take the first one
		# Read everything needed from the element BEFORE clicking it:
		# after the click the element reference may have become stale.
		href = element.get_attribute('href')
		link_text = element.text
		element.click()
		print(link_text)

		# Fixed waiting time before the browser is closed - presumably to
		# let the download finish (and, in interactive mode, to let the
		# user look at the page).
		time.sleep(300 if interactive else 5)
	finally:
		# quit() ends the whole WebDriver session; close() would only
		# close the current window and could leave the driver running.
		driver.quit()

	return os.path.basename(href)

def remove_first_quote_if_odd(text, verbose = 0):
	"""Drop the first double quote of *text* if the quotes are unbalanced.

	Args:
		text: The string to inspect.
		verbose: If greater than 0, the string is printed before and after
			the removal.

	Returns:
		*text* unchanged when it contains an even number of double quotes
		(including none); otherwise *text* without its first double quote.
	"""
	if text.count('"') % 2 == 0:
		return text

	# An odd count means there is at least one quote to remove.
	repaired = text.replace('"', '', 1)
	if verbose > 0:
		print(text)
		print(repaired)
	return repaired

def is_clubstation(call):
	"""Tell whether the fourth character of *call* is an ``X`` (any case).

	Callers in this file treat such call signs as club stations.

	Args:
		call: The call sign; must be longer than three characters.

	Returns:
		True if ``call[3]`` is 'X' or 'x', otherwise False.
	"""
	assert len(call) > 3
	return call[3].upper() == 'X'

def replace_substring_with_line(path, search_substring, verbose=0):
	"""Return the first line of a text file that contains the given name.

	Only the first 40 characters of ``search_substring`` are used for the
	case-insensitive search (presumably because the names may be truncated
	in the source data - TODO confirm).

	Args:
		path: Path of the text file to search, one entry per line.
		search_substring: The (possibly truncated) text to look for.
		verbose: Verbosity level; accepted for interface compatibility,
			currently not used here.

	Returns:
		The first matching line without leading/trailing whitespace (so
		without its line break). ``search_substring`` is returned unchanged
		if nothing matches, if it is empty or blank, or if the file cannot
		be read. A read error is reported on stdout and is not raised.
	"""
	needle = search_substring[0:40].lower()
	if not needle.strip():
		# An empty needle is "contained" in every line and would always
		# return the first line of the file, so there is nothing to look up.
		return search_substring

	try:
		with open(path, 'r') as file:
			for line in file:
				if needle in line.lower():
					# Strip the line break, otherwise it ends up in the name.
					return line.strip()
	except FileNotFoundError:
		print(f'The file {path} was not found.')
	except Exception as e:
		# Best effort: the lookup is optional, so report and fall through.
		print(f'An error occurred: {e}')

	return search_substring


def call_data_record(line, mod_date, verbose):
	"""Split one line of the callbook table into fields and validate it.

	The line is split at runs of four or more spaces and must yield exactly
	five fields. The first field must be a call sign of the form
	``OE<digit><1-3 capital letters>``; the second field is the name. For
	club stations (see ``is_clubstation``) an unbalanced leading double
	quote is removed from the name, and the name is looked up in the file
	``.callbook_club`` next to this script, if that file exists.

	The function currently returns nothing; the parsed fields are only
	printed, depending on ``verbose``.

	Args:
		line: One text line of the table, already stripped.
		mod_date: Modification date of the PDF; currently not used here.
		verbose: Verbosity level. Above 0 the call and name of club stations
			are printed, above 2 all fields of every record.

	Raises:
		ValueError: If the line does not consist of exactly five fields or
			if the first field is not a valid call sign.
	"""
	# A single regular expression for the whole record did not work
	# reliably, so the line is split at the column gaps instead.
	# Never split addresses containing 2 or 3 spaces; also several records
	# contain no address or no location.
	records = re.split(r'[ ]{4,65}', line)

	if verbose > 2 :
		print(f'Record length: {len(records)}')

		for m in records:
			print(m)

	# HARDENING: explicit checks instead of assert, so that they are also
	# active under "python -O" and produce a meaningful error message.
	if len(records) != 5:
		raise ValueError(f'Expected 5 fields but found {len(records)} in line: {line!r}')

	# OE call: the WHOLE field has to be the call sign. (Comparing
	# match.string with the field would always be true, because
	# match.string is the searched input itself.)
	call = records[0]
	if re.fullmatch(r'OE[0-9][A-Z]{1,3}', call) is None:
		raise ValueError(f'Invalid call sign {call!r} in line: {line!r}')

	fullname = records[1]
	# If there is a clubstation
	if is_clubstation(call):
		# Name starting with only one quotation mark, e.g. " -- remove that one:
		fullname = remove_first_quote_if_odd(fullname, verbose) # only found @ clubstations
		clubstationfile = '.callbook_club'
		if verbose > 0:
			print(f'Call: {call}, Name: {fullname}')
		path = os.path.join(os.path.dirname(os.path.abspath(__file__)), clubstationfile)
		if os.path.exists(path):
			fullname = replace_substring_with_line(path, fullname, verbose)
		if verbose > 0:
			print(f'Call: {call}, Name: {fullname}')


#	if not record:
#		return
#	if verbose == 1:
#		print(record.group(1))
#	if verbose >= 3:
#		print(f'Call: {record.group(1)}')
#		print(f'Name: {record.group(2)}')
#		#print(f'Location: {record[3]}')
#		#print(f'Address: {record[4]}')
#		#print(f'Permit Class: {record[5]}')

def call_analyse_pdf(file, verbose):
	"""Read the callbook PDF and hand every table line to ``call_data_record``.

	Args:
		file: Path of the PDF file (passed on to ``PdfReader``).
		verbose: Verbosity level. Any non-zero value prints the page count
			and the document metadata; 2 or more additionally prints every
			processed line.
	"""
	reader = PdfReader(file)
	# A PDF without a document information dictionary has no metadata
	# object at all, so every access to it has to be guarded.
	meta = reader.metadata
	mod_date = meta.modification_date if meta is not None else None

	if verbose:
		print(verbose)
		print('   Pages:', len(reader.pages))
		if meta is None:
			print('No document metadata available.')
		else:
			# All of the following could be None!
			print(f'  Author: {meta.author}')
			print(f' Creator: {meta.creator}')
			print(f'Producer: {meta.producer}')
			print(f' Subject: {meta.subject}')
			print(f'   Title: {meta.title}')
			print(f' Created: {meta.creation_date}')
			print(f'Modified: {mod_date}')

	for page in reader.pages:
		# The layout mode keeps the horizontal spacing of the columns,
		# which call_data_record relies on to split the fields.
		page_text = page.extract_text(extraction_mode="layout", layout_mode_space_vertically=False)
		lines = page_text.strip().splitlines()
		# Skip the first three and the last two lines of each page
		# (presumably page header and footer - TODO confirm).
		for line in lines[3:-2]:
			line = line.strip()
			if verbose >= 2:
				print(line)
			call_data_record(line, mod_date, verbose)

if __name__ == '__main__':
	# Script entry point: parse the options, obtain the PDF, analyse it.
	# Exit status 0 on success, 1 on any unexpected error.
	args = call_parser()
	exit_code = 0
	try:
		# The option names match the parameter names of call_website().
		filename = call_website(**vars(args))

		print(f'Filename: {filename}')
		call_analyse_pdf(filename, args.verbose)
	except Exception as e:
		print(f'Error: {e}', file=sys.stderr)
		exit_code = 1
	sys.exit(exit_code)