ADD .callbook_club file to expand clubstation names
This commit is contained in:
		
							
								
								
									
										180
									
								
								afu/callbook.py
									
									
									
									
									
								
							
							
						
						
									
										180
									
								
								afu/callbook.py
									
									
									
									
									
								
							@@ -28,14 +28,26 @@ def call_parser():
 | 
			
		||||
		Version {__version__}
 | 
			
		||||
		'''
 | 
			
		||||
	)
 | 
			
		||||
	parser.add_argument('--interactive', '-i', action='store_true', default=False)
 | 
			
		||||
	# parser.add_argument('--server', '-s', default=__website__, required=False)
 | 
			
		||||
	parser.add_argument('--version', '-v', action='version', version='{} {}'.format(os.path.split(__file__)[1],__version__))
 | 
			
		||||
	parser.add_argument('-i', '--interactive', action='store_true', default=False)
 | 
			
		||||
	# parser.add_argument('-s', '--server' default=__website__, required=False)
 | 
			
		||||
	parser.add_argument('-V', '--version', action='version', version='{} {}'.format(os.path.split(__file__)[1],__version__))
 | 
			
		||||
	parser.add_argument('-v', '--verbose', action='append_const', const = 1)
 | 
			
		||||
	parser.add_argument('-p', '--path', default='Rufzeichenliste_AT_Stand_010624.pdf', help= 'skip the download if the specified path to a PDF file exists')
 | 
			
		||||
	parser.add_argument('url', metavar='URL', nargs='?', default=__website__)
 | 
			
		||||
 | 
			
		||||
	return parser.parse_args()
 | 
			
		||||
	opt = parser.parse_args()
 | 
			
		||||
	opt.verbose = 0 if opt.verbose is None else sum(opt.verbose)
 | 
			
		||||
	return opt
 | 
			
		||||
 | 
			
		||||
def call_website(url,verbose,path='',interactive=False):
 | 
			
		||||
 | 
			
		||||
	if path:
 | 
			
		||||
		if os.path.exists(path):
 | 
			
		||||
			return path
 | 
			
		||||
		else:
 | 
			
		||||
			print(f'The given path "{path}" does not exist.')
 | 
			
		||||
			sys.exit(3)
 | 
			
		||||
 | 
			
		||||
def call_website(url,interactive=False):
 | 
			
		||||
	if(interactive):
 | 
			
		||||
		print('Interactive')
 | 
			
		||||
		driver=webdriver.Chrome()
 | 
			
		||||
@@ -71,61 +83,139 @@ def call_website(url,interactive=False):
 | 
			
		||||
	driver.close()
 | 
			
		||||
	return os.path.basename(href)
 | 
			
		||||
 | 
			
		||||
def get_pdf_content_lines(pdf_file_path):
	"""Yield the extracted text of a PDF file line by line, page by page.

	Args:
		pdf_file_path: Path of the PDF file to read.

	Yields:
		Each text line of each page, in page order, without line endings.
	"""
	# A PDF is binary data; it must not be decoded as text while reading.
	with open(pdf_file_path, 'rb') as f:
		pdf_reader = PdfReader(f)
		for page in pdf_reader.pages:
			# extract_text() is the current name of the legacy extractText().
			yield from page.extract_text().splitlines()
 | 
			
		||||
def remove_first_quote_if_odd(text, verbose = 0):
	"""Remove the first double quote of *text* if its double quotes are unbalanced.

	Args:
		text: The string to clean up.
		verbose: When greater than 0, the string is printed before and after
			the removal.

	Returns:
		*text* unchanged when it contains an even number of double quotes
		(including none); otherwise *text* without its first double quote.
	"""
	if text.count('"') % 2 == 0:
		return text

	# Only the first occurrence is dropped; all later quotes are kept.
	cleaned = text.replace('"', '', 1)
	if verbose > 0:
		print(text)
		print(cleaned)
	return cleaned
 | 
			
		||||
 | 
			
		||||
def is_clubstation(call):
	"""Return True if the fourth character of *call* is 'X' (case-insensitive).

	Args:
		call: The call sign to classify; needs at least four characters.

	Returns:
		True when ``call[3]`` is 'X' or 'x', otherwise False.

	Raises:
		ValueError: If *call* has fewer than four characters.
	"""
	# Explicit check instead of assert so that it still runs under "python -O".
	if len(call) <= 3:
		raise ValueError(f'Call sign {call!r} is too short to be classified.')
	return call[3].upper() == 'X'
 | 
			
		||||
 | 
			
		||||
def replace_substring_with_line(path, search_substring, verbose=0):
	"""Expand *search_substring* to the first line of a text file containing it.

	Only the first 40 characters of *search_substring* are compared, and the
	comparison ignores case.

	Args:
		path: Path of the text file holding one candidate per line.
		search_substring: The (possibly shortened) text to look for.
		verbose: Accepted for signature compatibility; not used here.

	Returns:
		The first matching line with surrounding whitespace (including the
		line ending) removed, or *search_substring* unchanged when nothing
		matches, the search text is blank, or the file cannot be read.
	"""
	try:
		needle = search_substring[:40].lower()
		# A blank needle is contained in every line and would always
		# return the first line of the file.
		if not needle.strip():
			return search_substring

		with open(path, 'r') as file:
			for line in file:
				if needle in line.lower():
					return line.strip()

	except FileNotFoundError:
		print(f'The file {path} was not found.')
	except Exception as e:
		# Best effort: any other failure falls back to the unexpanded text.
		print(f'An error occurred: {e}')

	return search_substring
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def call_data_record(line, mod_date, verbose):
	"""Split one text line of the call sign list into its fields and check them.

	The line is split on runs of 4 to 65 blanks and must yield exactly five
	fields; the first one must be a call sign of the form ``OE<digit><1-3
	letters>``. For club stations the name field is cleaned of an unbalanced
	double quote and, if a ``.callbook_club`` file exists next to this module,
	expanded to the matching line of that file.

	Args:
		line: One already stripped text line of the list.
		mod_date: Accepted but not used by this function.
		verbose: Verbosity level; > 0 prints call and name of club stations
			before and after expansion, > 2 also prints the split fields.

	Raises:
		ValueError: If the line does not split into five fields or the first
			field is not a valid call sign.
	"""
	# Never split addresses containing 2 or 3 spaces; several records also
	# contain no address or no location.
	records = re.split(r'[ ]{4,65}', line)

	if verbose > 2:
		print(f'Record length: {len(records)}')
		for m in records:
			print(m)

	# HARDENING: explicit exceptions, so the checks also run under "python -O".
	if len(records) != 5:
		raise ValueError(f'Expected 5 fields but found {len(records)} in line: {line!r}')

	# OE call: the whole first field has to be the call sign.
	call = records[0]
	if re.fullmatch(r'OE[0-9][A-Z]{1,3}', call) is None:
		raise ValueError(f'Invalid call sign {call!r} in line: {line!r}')

	fullname = records[1]
	if is_clubstation(call):
		# A name starting with only one quotation mark was only found at
		# club stations -- remove that one.
		fullname = remove_first_quote_if_odd(fullname, verbose)
		clubstationfile = '.callbook_club'
		if verbose > 0:
			print(f'Call: {call}, Name: {fullname}')
		path = os.path.join(os.path.dirname(os.path.abspath(__file__)), clubstationfile)
		if os.path.exists(path):
			fullname = replace_substring_with_line(path, fullname, verbose)
		if verbose > 0:
			print(f'Call: {call}, Name: {fullname}')
 | 
			
		||||
			
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
#	if not record:
 | 
			
		||||
#		return
 | 
			
		||||
#	if verbose == 1:
 | 
			
		||||
#		print(record.group(1))
 | 
			
		||||
#	if verbose >= 3:
 | 
			
		||||
#		print(f'Call: {record.group(1)}')
 | 
			
		||||
#		print(f'Name: {record.group(2)}')
 | 
			
		||||
#		#print(f'Location: {record[3]}')
 | 
			
		||||
#		#print(f'Address: {record[4]}')
 | 
			
		||||
#		#print(f'Permit Class: {record[5]}')
 | 
			
		||||
 | 
			
		||||
def call_analyse_pdf(file, verbose):
 | 
			
		||||
 | 
			
		||||
	# Define a regular expression to match tables
 | 
			
		||||
 | 
			
		||||
	reader = PdfReader(file)
 | 
			
		||||
	meta = reader.metadata
 | 
			
		||||
	print('   Pages:', len(reader.pages))
 | 
			
		||||
	# All of the following could be None!
 | 
			
		||||
	print(f'  Author: {meta.author}')
 | 
			
		||||
	print(f' Creator: {meta.creator}')
 | 
			
		||||
	print(f'Producer: {meta.producer}')
 | 
			
		||||
	print(f' Subject: {meta.subject}')
 | 
			
		||||
	print(f'   Title: {meta.title}')
 | 
			
		||||
	print(f' Created: {meta.creation_date}')
 | 
			
		||||
	print(f'Modified: {meta.modification_date}')
 | 
			
		||||
	if verbose:
 | 
			
		||||
		print(verbose)
 | 
			
		||||
		print('   Pages:', len(reader.pages))	
 | 
			
		||||
		# All of the following could be None!
 | 
			
		||||
		print(f'  Author: {meta.author}')
 | 
			
		||||
		print(f' Creator: {meta.creator}')
 | 
			
		||||
		print(f'Producer: {meta.producer}')
 | 
			
		||||
		print(f' Subject: {meta.subject}')
 | 
			
		||||
		print(f'   Title: {meta.title}')
 | 
			
		||||
		print(f' Created: {meta.creation_date}')
 | 
			
		||||
		print(f'Modified: {meta.modification_date}')
 | 
			
		||||
		
 | 
			
		||||
	for page in reader.pages:
 | 
			
		||||
		page_text = page.extract_text(extraction_mode="layout", layout_mode_space_vertically=False)
 | 
			
		||||
		#print(page_text)
 | 
			
		||||
		# page_text = page.extract_text(extraction_mode="layout", layout_mode_scale_weight=1.0)
 | 
			
		||||
		# print(page_text)
 | 
			
		||||
		# Find all tables in page_text
 | 
			
		||||
		calls = re.findall(r' +(OE[0-9][A-Z]{1,3}).*$', page_text)
 | 
			
		||||
		for call in calls:
 | 
			
		||||
			print(call)
 | 
			
		||||
		tables = re.findall(r'(?s)\b(?:\w+\s+){2,}\w+\b(?:\s*[,;]\s*\b(?:\w+\s+){2,}\w+\b)*', page_text)
 | 
			
		||||
		# Loop through each table and create a pandas DataFrame
 | 
			
		||||
		for table in tables:
 | 
			
		||||
			# Split the table into rows
 | 
			
		||||
			rows = table.strip().split('\n')
 | 
			
		||||
			# Split the rows into cells
 | 
			
		||||
			cells = [row.split('|') for row in rows]
 | 
			
		||||
			# Remove leading and trailing whitespace from cells
 | 
			
		||||
			cells = [[cell.strip() for cell in row] for row in cells]
 | 
			
		||||
			# Remove empty rows and columns
 | 
			
		||||
			cells = [[cell for cell in row if cell] for row in cells if row]
 | 
			
		||||
			# Create a pandas DataFrame from the cells
 | 
			
		||||
			df = pd.DataFrame(cells[1:], columns=cells[0])
 | 
			
		||||
 | 
			
		||||
			# TODO: Clean and manipulate the df as needed
 | 
			
		||||
 | 
			
		||||
		lines = page_text.strip().splitlines()
 | 
			
		||||
		for line in lines[3:-2]:
 | 
			
		||||
			line = line.strip()
 | 
			
		||||
			# calls = re.findall(r' +(OE[0-9][A-Z]{1,3}).*$', page_text)
 | 
			
		||||
			if verbose >= 2:
 | 
			
		||||
				print(line)
 | 
			
		||||
			call_data_record(line, meta.modification_date,verbose)
 | 
			
		||||
 | 
			
		||||
if __name__ == '__main__':
 | 
			
		||||
	# call_description()
 | 
			
		||||
	args = call_parser()
 | 
			
		||||
	# filename = 'Rufzeichenliste_AT_Stand_010624.pdf'
 | 
			
		||||
	try:
 | 
			
		||||
		filename = call_website(**vars(args))
 | 
			
		||||
 | 
			
		||||
		print(f'Filename: {filename}')
 | 
			
		||||
		call_analyse_pdf(filename)
 | 
			
		||||
		call_analyse_pdf(filename,args.verbose)
 | 
			
		||||
		sys.exit(0)
 | 
			
		||||
	except Exception as e:
 | 
			
		||||
		print('Error: {}'.format(e), file=sys.stderr)
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user