2024-06-13 03:03:39 +02:00
|
|
|
#!/usr/bin/env python3
|
|
|
|
|
|
|
|
import argparse
|
|
|
|
import os
|
|
|
|
import sys
|
2024-06-14 00:20:26 +02:00
|
|
|
import time
|
2024-06-16 07:38:30 +02:00
|
|
|
import pypdf
|
|
|
|
#from PyPDF2 import PdfReader
|
|
|
|
from pypdf import PdfReader
|
|
|
|
import re # regular expression
|
|
|
|
import pandas as pd
|
2024-06-13 03:03:39 +02:00
|
|
|
|
2024-06-14 00:20:26 +02:00
|
|
|
__version__ = '1.0.0'
|
|
|
|
__website__ = 'https://www.fb.gv.at/Funk/amateurfunkdienst.html'
|
2024-06-13 03:03:39 +02:00
|
|
|
|
|
|
|
from selenium import webdriver
|
|
|
|
from selenium.webdriver.common.by import By
|
2024-06-14 00:20:26 +02:00
|
|
|
from selenium.webdriver.chrome.service import Service as ChromiumService
|
2024-06-13 03:03:39 +02:00
|
|
|
|
2024-06-14 00:20:26 +02:00
|
|
|
def call_description():
    """Print a one-line banner naming this tool and its version."""
    banner = 'Download and Parse the Austrian Callbook Version {}'.format(__version__)
    print(banner)
|
2024-06-13 03:03:39 +02:00
|
|
|
|
2024-06-14 00:20:26 +02:00
|
|
|
def call_parser():
    """Define the command-line interface and parse ``sys.argv``.

    Returns:
        argparse.Namespace with the attributes ``interactive`` (bool)
        and ``url`` (str, defaults to the module-level ``__website__``).
    """
    parser = argparse.ArgumentParser(
        description='Download and Parse the Austrian Callbook',
        epilog=f'''
Written by Thomas Kuschel,
Version {__version__}
''',
    )
    # Run a visible browser session instead of the default headless one.
    parser.add_argument('--interactive', '-i', action='store_true', default=False)
    # parser.add_argument('--server', '-s', default=__website__, required=False)
    parser.add_argument(
        '--version', '-v',
        action='version',
        version=f'{os.path.split(__file__)[1]} {__version__}',
    )
    # Positional URL is optional; the authority's website is the default.
    parser.add_argument('url', metavar='URL', nargs='?', default=__website__)
    return parser.parse_args()
|
2024-06-13 03:03:39 +02:00
|
|
|
|
2024-06-14 00:20:26 +02:00
|
|
|
def call_website(url, interactive=False):
    """Open *url* in Chrome, click the first link whose text contains
    "Rufzeichen" (call sign), and return the basename of that link's href.

    Clicking the link triggers the browser's file download; the sleep
    afterwards gives the download time to complete before the browser
    is closed.

    Args:
        url: Page to load (the telecom authority's amateur-radio page).
        interactive: When True, run a visible browser and wait 5 minutes
            so a user can watch or intervene; otherwise run headless and
            wait only 5 seconds.

    Returns:
        str: Basename of the matched link's ``href`` — presumably the
        filename of the downloaded PDF (TODO confirm it matches the name
        Chrome saves the download under).

    Exits:
        Calls ``sys.exit(2)`` when no matching link is found.
    """
    if interactive:
        print('Interactive')
        driver = webdriver.Chrome()
    else:
        print('Headless Script')
        options = webdriver.ChromeOptions()
        options.add_argument('--headless')
        options.add_argument('--no-sandbox')
        options.add_argument('--disable-dev-shm-usage')
        driver = webdriver.Chrome(options=options)

    driver.get(url)
    print(driver.title)
    # elements = driver.find_elements(By.XPATH,'//a[contains(@href,"Rufzeichen")]')
    elements = driver.find_elements(By.PARTIAL_LINK_TEXT, "Rufzeichen")

    if elements:
        element = elements[0]  # take the first one
        href = element.get_attribute('href')
        # click() returns None; it only starts the download, so its
        # result must not be captured as a filename.
        element.click()
    else:
        print('Sorry, no Link containing "Rufzeichen" found.')
        driver.close()
        sys.exit(2)

    print(element.text)
    # print(href)
    # Wait for the download to finish before tearing down the browser.
    if interactive:
        time.sleep(300)
    else:
        time.sleep(5)

    driver.close()
    return os.path.basename(href)
|
2024-06-13 03:03:39 +02:00
|
|
|
|
2024-06-16 07:38:30 +02:00
|
|
|
def get_pdf_content_lines(pdf_file_path):
    """Yield the extracted text of *pdf_file_path* one line at a time.

    Args:
        pdf_file_path: Path to a PDF file.

    Yields:
        str: Each text line of each page, in document order.
    """
    # PDFs are binary: open with 'rb' (text mode would raise decode errors).
    with open(pdf_file_path, 'rb') as f:
        pdf_reader = PdfReader(f)
        for page in pdf_reader.pages:
            # pypdf's current API is extract_text(); the old PyPDF2
            # extractText() no longer exists.  Also fixes the original
            # 'spitlines' typo.
            for line in page.extract_text().splitlines():
                yield line
|
|
|
|
|
|
|
|
def call_analyse_pdf(file):
    """Print the callbook PDF's metadata and scan each page for Austrian
    call signs and table-like text runs.

    Args:
        file: Path to the downloaded callbook PDF.
    """
    # Define a regular expression to match tables
    reader = PdfReader(file)
    meta = reader.metadata
    print(' Pages:', len(reader.pages))
    # All of the following could be None!
    print(f' Author: {meta.author}')
    print(f' Creator: {meta.creator}')
    print(f'Producer: {meta.producer}')
    print(f' Subject: {meta.subject}')
    print(f' Title: {meta.title}')
    print(f' Created: {meta.creation_date}')
    print(f'Modified: {meta.modification_date}')
    for page in reader.pages:
        # Layout mode keeps the column alignment the regexes below rely on.
        page_text = page.extract_text(extraction_mode="layout", layout_mode_space_vertically=False)
        #print(page_text)
        # page_text = page.extract_text(extraction_mode="layout", layout_mode_scale_weight=1.0)
        # print(page_text)
        # Austrian call signs: "OE" + district digit + 1-3 letter suffix.
        # re.MULTILINE is required so '$' anchors at every line end; without
        # it only a match on the very last line of the page could succeed.
        calls = re.findall(r' +(OE[0-9][A-Z]{1,3}).*$', page_text, re.MULTILINE)
        for call in calls:
            print(call)
        # Find all tables in page_text
        tables = re.findall(r'(?s)\b(?:\w+\s+){2,}\w+\b(?:\s*[,;]\s*\b(?:\w+\s+){2,}\w+\b)*', page_text)
        # Loop through each table and create a pandas DataFrame
        for table in tables:
            # Split the table into rows
            rows = table.strip().split('\n')
            # Split the rows into cells
            cells = [row.split('|') for row in rows]
            # Remove leading and trailing whitespace from cells
            cells = [[cell.strip() for cell in row] for row in cells]
            # Remove empty rows and columns
            cells = [[cell for cell in row if cell] for row in cells if row]
            # Create a pandas DataFrame from the cells (first row = header)
            df = pd.DataFrame(cells[1:], columns=cells[0])

            # TODO: Clean and manipulate the df as needed
|
2024-06-13 03:03:39 +02:00
|
|
|
if __name__ == '__main__':
    # call_description()
    args = call_parser()
    try:
        # Download the callbook PDF and remember the name it was saved under.
        filename = call_website(**vars(args))
        # Fix: the original f-string had no placeholder and printed a
        # literal instead of the just-downloaded file's name.
        print(f'Filename: {filename}')
        call_analyse_pdf(filename)
        sys.exit(0)
    except Exception as e:
        # Top-level boundary: report the failure and exit non-zero.
        # (sys.exit raises SystemExit, which is not caught here.)
        print('Error: {}'.format(e), file=sys.stderr)
        sys.exit(1)
|