Add parser, still work in progress

Thomas Kuschel 2024-06-16 07:38:30 +02:00
parent 503abb86a6
commit 23bd746108


@@ -4,6 +4,11 @@ import argparse
import os
import sys
import time
import pypdf
#from PyPDF2 import PdfReader
from pypdf import PdfReader
import re # regular expression
import pandas as pd
__version__ = '1.0.0'
__website__ = 'https://www.fb.gv.at/Funk/amateurfunkdienst.html'
@@ -59,19 +64,68 @@ def call_website(url,interactive=False):
        print(element.text)
        # print(href)
        if(interactive):
            time.sleep(5)
        else:
            time.sleep(300)
    else:
        time.sleep(5)
    driver.close()
    return os.path.basename(href)
def get_pdf_content_lines(pdf_file_path):
    # Open the PDF in binary mode and yield its text content line by line
    with open(pdf_file_path, 'rb') as f:
        pdf_reader = PdfReader(f)
        for page in pdf_reader.pages:
            for line in page.extract_text().splitlines():
                yield line
def call_analyse_pdf(file):
    # Define a regular expression to match tables
    reader = PdfReader(file)
    meta = reader.metadata

    print('   Pages:', len(reader.pages))
    # All of the following could be None!
    print(f'  Author: {meta.author}')
    print(f' Creator: {meta.creator}')
    print(f'Producer: {meta.producer}')
    print(f' Subject: {meta.subject}')
    print(f'   Title: {meta.title}')
    print(f' Created: {meta.creation_date}')
    print(f'Modified: {meta.modification_date}')

    for page in reader.pages:
        page_text = page.extract_text(extraction_mode="layout", layout_mode_space_vertically=False)
        #print(page_text)
        # page_text = page.extract_text(extraction_mode="layout", layout_mode_scale_weight=1.0)
        # print(page_text)

        # Find all callsigns (OE...) in page_text; re.MULTILINE lets $ match at each line end
        calls = re.findall(r' +(OE[0-9][A-Z]{1,3}).*$', page_text, re.MULTILINE)
        for call in calls:
            print(call)

        # Find all tables in page_text
        tables = re.findall(r'(?s)\b(?:\w+\s+){2,}\w+\b(?:\s*[,;]\s*\b(?:\w+\s+){2,}\w+\b)*', page_text)
        # Loop through each table and create a pandas DataFrame
        for table in tables:
            # Split the table into rows
            rows = table.strip().split('\n')
            # Split the rows into cells
            cells = [row.split('|') for row in rows]
            # Remove leading and trailing whitespace from cells
            cells = [[cell.strip() for cell in row] for row in cells]
            # Remove empty cells and empty rows
            cells = [[cell for cell in row if cell] for row in cells if row]
            # Create a pandas DataFrame from the cells
            df = pd.DataFrame(cells[1:], columns=cells[0])
            # TODO: Clean and manipulate the df as needed
if __name__ == '__main__':
    # call_description()
    args = call_parser()
    try:
        filename = call_website(**vars(args))
        print(f'Filename: {filename}')
        call_analyse_pdf(filename)
        sys.exit(0)
    except Exception as e:
        print('Error: {}'.format(e), file=sys.stderr)
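
For reference, a minimal standalone sketch of the callsign-matching step introduced in call_analyse_pdf. It runs the same pattern against a made-up two-line sample instead of the layout-extracted PDF text; the sample rows, names, and column layout are illustrative assumptions, not data from the real callsign list.

import re

# Illustrative stand-in for page_text produced by extract_text(extraction_mode="layout")
sample_page_text = (
    "  OE1ABC   Max Mustermann      Wien             1\n"
    "  OE3XYZ   Erika Musterfrau    Wiener Neustadt  1\n"
)

# Same pattern as in call_analyse_pdf; re.MULTILINE makes $ match at each line end
calls = re.findall(r' +(OE[0-9][A-Z]{1,3}).*$', sample_page_text, re.MULTILINE)
print(calls)  # expected: ['OE1ABC', 'OE3XYZ']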