Add parser, still work in progress
This commit is contained in:
		@@ -4,6 +4,11 @@ import argparse
 | 
			
		||||
import os
 | 
			
		||||
import sys
 | 
			
		||||
import time
 | 
			
		||||
import pypdf
 | 
			
		||||
#from PyPDF2 import PdfReader
 | 
			
		||||
from pypdf import PdfReader
 | 
			
		||||
import re # regular expression
 | 
			
		||||
import pandas as pd
 | 
			
		||||
 | 
			
		||||
__version__ = '1.0.0'
 | 
			
		||||
__website__ = 'https://www.fb.gv.at/Funk/amateurfunkdienst.html'
 | 
			
		||||
@@ -59,19 +64,68 @@ def call_website(url,interactive=False):
 | 
			
		||||
	print(element.text)
 | 
			
		||||
	# print(href)
 | 
			
		||||
	if(interactive):
 | 
			
		||||
		time.sleep(5)
 | 
			
		||||
	else:
 | 
			
		||||
		time.sleep(300)
 | 
			
		||||
	else:
 | 
			
		||||
		time.sleep(5)
 | 
			
		||||
 | 
			
		||||
	driver.close()
 | 
			
		||||
	return os.path.basename(href)
 | 
			
		||||
 | 
			
		||||
def get_pdf_content_lines(pdf_file_path):
	"""Yield the extracted text of a PDF one line at a time, page by page.

	Parameters
	----------
	pdf_file_path : str
		Path of the PDF file to read.

	Yields
	------
	str
		One line of extracted page text.
	"""
	# PDFs are binary: PdfReader needs an 'rb' stream; a text-mode handle
	# raises UnicodeDecodeError on real-world files.
	with open(pdf_file_path, 'rb') as f:
		pdf_reader = PdfReader(f)
		for page in pdf_reader.pages:
			# pypdf's method is extract_text(); extractText() was the old
			# PyPDF2 name and is gone in pypdf.  Also fixes the
			# 'spitlines' -> 'splitlines' typo.
			for line in page.extract_text().splitlines():
				yield line
 | 
			
		||||
 | 
			
		||||
def call_analyse_pdf(file):
	"""Print a PDF's metadata and scan its pages for OE callsigns and tables.

	Parameters
	----------
	file : str
		Path of the PDF file to analyse.

	Notes
	-----
	Work in progress: tables are parsed into pandas DataFrames but not yet
	used further (see TODO below).
	"""
	reader = PdfReader(file)
	meta = reader.metadata
	print('   Pages:', len(reader.pages))
	# All of the following could be None!
	# NOTE(review): meta itself can also be None for PDFs without a metadata
	# dictionary — confirm whether a guard is needed for the target files.
	print(f'  Author: {meta.author}')
	print(f' Creator: {meta.creator}')
	print(f'Producer: {meta.producer}')
	print(f' Subject: {meta.subject}')
	print(f'   Title: {meta.title}')
	print(f' Created: {meta.creation_date}')
	print(f'Modified: {meta.modification_date}')
	for page in reader.pages:
		# Layout mode keeps the column structure of the official PDF table.
		page_text = page.extract_text(extraction_mode="layout", layout_mode_space_vertically=False)
		#print(page_text)
		# page_text = page.extract_text(extraction_mode="layout", layout_mode_scale_weight=1.0)
		# print(page_text)
		# Austrian amateur-radio callsigns: OE + district digit + 1-3 letters.
		# re.MULTILINE is required: page_text spans many lines and without it
		# '$' only matches at end-of-string, so callsigns on interior lines
		# were silently missed.
		calls = re.findall(r' +(OE[0-9][A-Z]{1,3}).*$', page_text, re.MULTILINE)
		for call in calls:
			print(call)
		# Heuristic match for table-like runs of words in page_text.
		tables = re.findall(r'(?s)\b(?:\w+\s+){2,}\w+\b(?:\s*[,;]\s*\b(?:\w+\s+){2,}\w+\b)*', page_text)
		# Loop through each table and create a pandas DataFrame
		for table in tables:
			# Split the table into rows
			rows = table.strip().split('\n')
			# Split the rows into cells
			cells = [row.split('|') for row in rows]
			# Remove leading and trailing whitespace from cells
			cells = [[cell.strip() for cell in row] for row in cells]
			# Remove empty rows and columns
			cells = [[cell for cell in row if cell] for row in cells if row]
			# Create a pandas DataFrame from the cells (first row = header)
			df = pd.DataFrame(cells[1:], columns=cells[0])

			# TODO: Clean and manipulate the df as needed
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if __name__ == '__main__':
	# call_description()
	args = call_parser()
	try:
		# Download the PDF, then analyse it.
		filename = call_website(**vars(args))
		# Single status line (the previous duplicate print was left-over
		# diff residue; the f-string also had no placeholder, so the
		# downloaded file name was never shown).
		print(f'Filename: {filename}')
		call_analyse_pdf(filename)
		sys.exit(0)
	except Exception as e:
		# Top-level boundary: report and exit nonzero so callers/scripts
		# can detect the failure (previously fell through with status 0).
		print('Error: {}'.format(e), file=sys.stderr)
		sys.exit(1)
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user