# A call sign of the form OE<digit><1-3 capital letters> that is preceded by
# at least one space.  re.MULTILINE is required so that `$` anchors at the end
# of every line; without it only the last line of a page could ever match.
_CALLSIGN_RE = re.compile(r' +(OE[0-9][A-Z]{1,3}).*$', re.MULTILINE)

# Heuristic "table" detector: a run of three or more whitespace-separated
# words, optionally continued by further such runs joined with ',' or ';'.
_TABLE_RE = re.compile(r'(?s)\b(?:\w+\s+){2,}\w+\b(?:\s*[,;]\s*\b(?:\w+\s+){2,}\w+\b)*')


def get_pdf_content_lines(pdf_file_path):
    """Yield the extracted text of a PDF file line by line.

    Args:
        pdf_file_path: Path of the PDF file to read.

    Yields:
        Each text line of each page, in page order, without line endings.
    """
    # PDF is a binary format, so the file has to be opened in binary mode.
    # The file stays open while the generator is being consumed.
    with open(pdf_file_path, 'rb') as f:
        pdf_reader = PdfReader(f)
        for page in pdf_reader.pages:
            yield from page.extract_text().splitlines()


def find_callsigns(page_text):
    """Return every call sign found in *page_text*, in order of appearance.

    At most one call sign is reported per text line, because the pattern
    consumes the remainder of the line after the first hit.

    Args:
        page_text: Text of one page; may span several lines or be empty.

    Returns:
        A list of call sign strings (empty when nothing matches).
    """
    return _CALLSIGN_RE.findall(page_text)


def parse_tables(page_text):
    """Build one pandas DataFrame per table-like text run in *page_text*.

    The first non-empty row of a run becomes the column header, the
    remaining non-empty rows become the data.

    Args:
        page_text: Text of one page; may be empty.

    Returns:
        A list of DataFrames (empty when no table-like run is found).
    """
    frames = []
    for table in _TABLE_RE.findall(page_text):
        rows = []
        for line in table.strip().split('\n'):
            # NOTE(review): the current table pattern cannot match '|', so
            # every row ends up as a single cell; the split is kept for the
            # planned column handling.
            cells = [cell.strip() for cell in line.split('|')]
            cells = [cell for cell in cells if cell]
            # Drop rows that are empty *after* cleaning (blank lines inside a
            # run), otherwise they show up as all-empty DataFrame rows.
            if cells:
                rows.append(cells)
        if not rows:
            continue
        header, *body = rows
        frames.append(pd.DataFrame(body, columns=header))
    return frames


def call_analyse_pdf(file):
    """Print the metadata and the call signs of a PDF document.

    Args:
        file: Whatever ``PdfReader`` accepts as its first argument (the
            caller in this file passes a file name).

    Returns:
        None. The results are written to standard output.
    """
    reader = PdfReader(file)
    meta = reader.metadata
    print(' Pages:', len(reader.pages))

    # The metadata object itself and any of its fields could be None;
    # getattr() with a default prints "None" in both cases instead of
    # raising AttributeError.
    metadata_fields = (
        (' Author', 'author'),
        (' Creator', 'creator'),
        ('Producer', 'producer'),
        (' Subject', 'subject'),
        (' Title', 'title'),
        (' Created', 'creation_date'),
        ('Modified', 'modification_date'),
    )
    for label, attribute in metadata_fields:
        print(f'{label}: {getattr(meta, attribute, None)}')

    for page in reader.pages:
        page_text = page.extract_text(extraction_mode="layout", layout_mode_space_vertically=False)
        for call in find_callsigns(page_text):
            print(call)

        # TODO: Clean and manipulate the DataFrames as needed; for now they
        # are only built and discarded.
        parse_tables(page_text)