Add parser, still work in progress
This commit is contained in:
		@@ -4,6 +4,11 @@ import argparse
 | 
			
		||||
import os
 | 
			
		||||
import sys
 | 
			
		||||
import time
 | 
			
		||||
import pypdf
 | 
			
		||||
#from PyPDF2 import PdfReader
 | 
			
		||||
from pypdf import PdfReader
 | 
			
		||||
import re # regular expression
 | 
			
		||||
import pandas as pd
 | 
			
		||||
 | 
			
		||||
__version__ = '1.0.0'
 | 
			
		||||
__website__ = 'https://www.fb.gv.at/Funk/amateurfunkdienst.html'
 | 
			
		||||
@@ -59,19 +64,68 @@ def call_website(url,interactive=False):
 | 
			
		||||
	print(element.text)
 | 
			
		||||
	# print(href)
 | 
			
		||||
	if(interactive):
 | 
			
		||||
		time.sleep(5)
 | 
			
		||||
	else:
 | 
			
		||||
		time.sleep(300)
 | 
			
		||||
	else:
 | 
			
		||||
		time.sleep(5)
 | 
			
		||||
 | 
			
		||||
	driver.close()
 | 
			
		||||
	return os.path.basename(href)
 | 
			
		||||
 | 
			
		||||
def get_pdf_content_lines(pdf_file_path):
	"""Yield the extracted text of a PDF one line at a time, page by page.

	Parameters
	----------
	pdf_file_path : str
		Path of the PDF file to read.

	Yields
	------
	str
		One line of extracted page text.
	"""
	# PDFs are binary: PdfReader needs an 'rb' stream; a text-mode handle
	# raises UnicodeDecodeError on real-world files.
	with open(pdf_file_path, 'rb') as f:
		pdf_reader = PdfReader(f)
		for page in pdf_reader.pages:
			# pypdf's method is extract_text(); extractText() was the old
			# PyPDF2 name and is gone in pypdf.  Also fixes the
			# 'spitlines' -> 'splitlines' typo.
			for line in page.extract_text().splitlines():
				yield line
 | 
			
		||||
 | 
			
		||||
def call_analyse_pdf(file):
	"""Print a PDF's metadata and scan its pages for OE callsigns and tables.

	Parameters
	----------
	file : str
		Path of the PDF file to analyse.

	Notes
	-----
	Work in progress: tables are parsed into pandas DataFrames but not yet
	used further (see TODO below).
	"""
	reader = PdfReader(file)
	meta = reader.metadata
	print('   Pages:', len(reader.pages))
	# All of the following could be None!
	# NOTE(review): meta itself can also be None for PDFs without a metadata
	# dictionary — confirm whether a guard is needed for the target files.
	print(f'  Author: {meta.author}')
	print(f' Creator: {meta.creator}')
	print(f'Producer: {meta.producer}')
	print(f' Subject: {meta.subject}')
	print(f'   Title: {meta.title}')
	print(f' Created: {meta.creation_date}')
	print(f'Modified: {meta.modification_date}')
	for page in reader.pages:
		# Layout mode keeps the column structure of the official PDF table.
		page_text = page.extract_text(extraction_mode="layout", layout_mode_space_vertically=False)
		#print(page_text)
		# page_text = page.extract_text(extraction_mode="layout", layout_mode_scale_weight=1.0)
		# print(page_text)
		# Austrian amateur-radio callsigns: OE + district digit + 1-3 letters.
		# re.MULTILINE is required: page_text spans many lines and without it
		# '$' only matches at end-of-string, so callsigns on interior lines
		# were silently missed.
		calls = re.findall(r' +(OE[0-9][A-Z]{1,3}).*$', page_text, re.MULTILINE)
		for call in calls:
			print(call)
		# Heuristic match for table-like runs of words in page_text.
		tables = re.findall(r'(?s)\b(?:\w+\s+){2,}\w+\b(?:\s*[,;]\s*\b(?:\w+\s+){2,}\w+\b)*', page_text)
		# Loop through each table and create a pandas DataFrame
		for table in tables:
			# Split the table into rows
			rows = table.strip().split('\n')
			# Split the rows into cells
			cells = [row.split('|') for row in rows]
			# Remove leading and trailing whitespace from cells
			cells = [[cell.strip() for cell in row] for row in cells]
			# Remove empty rows and columns
			cells = [[cell for cell in row if cell] for row in cells if row]
			# Create a pandas DataFrame from the cells (first row = header)
			df = pd.DataFrame(cells[1:], columns=cells[0])

			# TODO: Clean and manipulate the df as needed
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
if __name__ == '__main__':
	# call_description()
	args = call_parser()
	try:
		# Download the PDF, then analyse it.
		filename = call_website(**vars(args))
		# Single status line (the previous duplicate print was left-over
		# diff residue; the f-string also had no placeholder, so the
		# downloaded file name was never shown).
		print(f'Filename: {filename}')
		call_analyse_pdf(filename)
		sys.exit(0)
	except Exception as e:
		# Top-level boundary: report and exit nonzero so callers/scripts
		# can detect the failure (previously fell through with status 0).
		print('Error: {}'.format(e), file=sys.stderr)
		sys.exit(1)
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user