ADD .callbook_club file to expand clubstation names

2024-06-16 17:00:46 +02:00
parent 23bd746108
commit ad1a7c4507
3 changed files with 158 additions and 45 deletions
--- a/afu/callbook.py
+++ b/afu/callbook.py
@@ -28,14 +28,26 @@ def call_parser():
 		Version {__version__}
 		'''
 	)
-	parser.add_argument('--interactive', '-i', action='store_true', default=False)
-	# parser.add_argument('--server', '-s', default=__website__, required=False)
-	parser.add_argument('--version', '-v', action='version', version='{} {}'.format(os.path.split(__file__)[1],__version__))
+	parser.add_argument('-i', '--interactive', action='store_true', default=False)
+	# parser.add_argument('-s', '--server', default=__website__, required=False)
+	parser.add_argument('-V', '--version', action='version', version='{} {}'.format(os.path.split(__file__)[1],__version__))
+	parser.add_argument('-v', '--verbose', action='append_const', const = 1)
+	parser.add_argument('-p', '--path', default='Rufzeichenliste_AT_Stand_010624.pdf', help= 'skip the download if the specified path to a PDF file exists')
 	parser.add_argument('url', metavar='URL', nargs='?', default=__website__)

-	return parser.parse_args()
+	opt = parser.parse_args()
+	opt.verbose = 0 if opt.verbose is None else sum(opt.verbose)
+	return opt
+
+def call_website(url,verbose,path='',interactive=False):
+
+	if path:
+		if os.path.exists(path):
+			return path
+		else:
+			print(f'The given path "{path}" does not exist.')
+			sys.exit(3)

-def call_website(url,interactive=False):
 	if(interactive):
 		print('Interactive')
 		driver=webdriver.Chrome()
@@ -71,61 +83,139 @@ def call_website(url,interactive=False):
 	driver.close()
 	return os.path.basename(href)

-def get_pdf_content_lines(pdf_file_path):
-	with open(pdf_file_path) as f:
-		pdf_reader = PdfReader(f)
-		for page in pdf_reader.pages:
-			for line in page.extractText().spitlines():
-				yield line
+def remove_first_quote_if_odd(text, verbose = 0):
+	double_quote_cnt = text.count('"')
+	# single_quote_cnt = text.count("'")

-def call_analyse_pdf(file):
+	if (double_quote_cnt % 2 != 0): # or (single_quote_cnt % 2 != 0):
+		# Find and remove the first quote
+		for i, char in enumerate(text):
+				if char in ['"']: # ['"', "'"]:
+					if (verbose > 0):
+						print(text)
+					text = text[:i] + text[i+1:]
+					if (verbose > 0):
+						print(text)
+					break
+	return text
+
+def is_clubstation(call):
+	assert(len(call) > 3)
+	if call[3].upper() == 'X':
+		return True
+	return False
+
+def replace_substring_with_line(path, search_substring, verbose=0):
+	
+	try:
+		with open(path, 'r') as file:
+			lines = file.readlines()
+		
+		search_substring
+		for line in lines:
+			if search_substring[0:40].lower() in line.lower():
+				modified_line = line.strip()
+				# Replace the substring with the whole line
+				## line = line.lower().replace(search_substring.lower(), modified_line)
+				modified_line = line
+				return modified_line
+
+	except FileNotFoundError:
+		print(f'The file {path} was not found.')
+	except Exception as e:
+		print(f'An error occurred: {e}')
+	
+	return search_substring
+
+
+def call_data_record(line, mod_date, verbose):
+
+	# we have to split the record with a cost-intensive regular expression
+	# record = re.split('OE[0-9][A-Z]{1,3}[ \t]{3,20}',line) # this does not work 100%
+	# record = re.findall(r'(OE[0-9][A-Z]{1,3})[ \t]{2,12}([A-ZÄÖÜ].+[ ]?.*[ ]?.*[ ]?.*)[ \t]{3,30}(.{3,30})[ \t]{3,30}([1,3,4])', line)
+	# record = re.search(r'(OE[0-9][A-Z]{1,3})[ ]{2,12}([. ]+)[ ]{3,50}([. ]+)[ ]{3,50}([1-4])', line)
+	# record = re.search(r'^(OE[0-9][A-Z]{1,3})[ \t]{2,20}([\w ]{1,12})[ ]{3,50}(.*)([1-4]{1})$', line)
+
+	# Never split addresses containing 2 or 3 spaces; note that several records contain no address or no location
+	records = re.split(r'[ ]{4,65}', line)
+	# [records for record in records]
+
+	if verbose > 2 :
+		print(f'Record length: {len(records)}')
+
+		for m in records:
+			print(m)
+
+	# HARDENING:
+	assert(len(records) == 5)
+	# OE Call:
+	call = records[0]
+	match = re.search(r'^(OE[0-9][A-Z]{1,3})', call)
+	assert(match.string == call)
+	fullname = records[1]
+	# If there is a clubstation
+	if is_clubstation(call):
+		# Name contains an unmatched (odd) quotation mark, e.g. a lone " -- remove the first one:
+		fullname = remove_first_quote_if_odd(fullname, verbose) # only found @ clubstations
+		clubstationfile = '.callbook_club'
+		if verbose > 0:
+			print(f'Call: {call}, Name: {fullname}')
+		path = os.path.join(os.path.dirname(os.path.abspath(__file__)), clubstationfile)
+		if os.path.exists(path):
+			fullname = replace_substring_with_line(path, fullname, verbose)
+		if verbose > 0:
+			print(f'Call: {call}, Name: {fullname}')
+			
+
+
+#	if not record:
+#		return
+#	if verbose == 1:
+#		print(record.group(1))
+#	if verbose >= 3:
+#		print(f'Call: {record.group(1)}')
+#		print(f'Name: {record.group(2)}')
+#		#print(f'Location: {record[3]}')
+#		#print(f'Address: {record[4]}')
+#		#print(f'Permit Class: {record[5]}')
+
+def call_analyse_pdf(file, verbose):

 	# Define a regular expression to match tables

 	reader = PdfReader(file)
 	meta = reader.metadata
-	print('   Pages:', len(reader.pages))
-	# All of the following could be None!
-	print(f'  Author: {meta.author}')
-	print(f' Creator: {meta.creator}')
-	print(f'Producer: {meta.producer}')
-	print(f' Subject: {meta.subject}')
-	print(f'   Title: {meta.title}')
-	print(f' Created: {meta.creation_date}')
-	print(f'Modified: {meta.modification_date}')
+	if verbose:
+		print(verbose)
+		print('   Pages:', len(reader.pages))	
+		# All of the following could be None!
+		print(f'  Author: {meta.author}')
+		print(f' Creator: {meta.creator}')
+		print(f'Producer: {meta.producer}')
+		print(f' Subject: {meta.subject}')
+		print(f'   Title: {meta.title}')
+		print(f' Created: {meta.creation_date}')
+		print(f'Modified: {meta.modification_date}')
+		
 	for page in reader.pages:
 		page_text = page.extract_text(extraction_mode="layout", layout_mode_space_vertically=False)
-		#print(page_text)
-		# page_text = page.extract_text(extraction_mode="layout", layout_mode_scale_weight=1.0)
-		# print(page_text)
-		# Find all tables in page_text
-		calls = re.findall(r' +(OE[0-9][A-Z]{1,3}).*$', page_text)
-		for call in calls:
-			print(call)
-		tables = re.findall(r'(?s)\b(?:\w+\s+){2,}\w+\b(?:\s*[,;]\s*\b(?:\w+\s+){2,}\w+\b)*', page_text)
-		# Loop through each table and create a pandas DataFrame
-		for table in tables:
-			# Split the table into rows
-			rows = table.strip().split('\n')
-			# Split the rows into cells
-			cells = [row.split('|') for row in rows]
-			# Remove leading and trailing whitespace from cells
-			cells = [[cell.strip() for cell in row] for row in cells]
-			# Remove empty rows and columns
-			cells = [[cell for cell in row if cell] for row in cells if row]
-			# Create a pandas DataFrame from the cells
-			df = pd.DataFrame(cells[1:], columns=cells[0])
-
-			# TODO: Clean and manipulate the df as needed
-
+		lines = page_text.strip().splitlines()
+		for line in lines[3:-2]:
+			line = line.strip()
+			# calls = re.findall(r' +(OE[0-9][A-Z]{1,3}).*$', page_text)
+			if verbose >= 2:
+				print(line)
+			call_data_record(line, meta.modification_date,verbose)

 if __name__ == '__main__':
 	# call_description()
 	args = call_parser()
+	# filename = 'Rufzeichenliste_AT_Stand_010624.pdf'
 	try:
 		filename = call_website(**vars(args))
+
 		print(f'Filename: {filename}')
-		call_analyse_pdf(filename)
+		call_analyse_pdf(filename,args.verbose)
 		sys.exit(0)
 	except Exception as e:
 		print('Error: {}'.format(e), file=sys.stderr)