ADD typo fixing file
This commit is contained in:
		
							
								
								
									
										214
									
								
								afu/callbook.py
									
									
									
									
									
								
							
							
						
						
									
										214
									
								
								afu/callbook.py
									
									
									
									
									
								
							@@ -33,13 +33,16 @@ def call_parser():
 | 
			
		||||
	parser.add_argument('-V', '--version', action='version', version='{} {}'.format(os.path.split(__file__)[1],__version__))
 | 
			
		||||
	parser.add_argument('-v', '--verbose', action='append_const', const = 1)
 | 
			
		||||
	parser.add_argument('-p', '--path', default='Rufzeichenliste_AT_Stand_010624.pdf', help= 'skip the download if the specified path to a PDF file exists')
 | 
			
		||||
	# parser.add_argument('-t', '--type', default='' , help='specify the output, supported types are [ CSV | JSON ]') # not implemented yet
 | 
			
		||||
	parser.add_argument('-o', '--output', default='', help='specify the file where the data are written to, default stdout')
 | 
			
		||||
	parser.add_argument('-m', '--mariadb', help='SQL interface to MariaDB (MySql) format "<IP-Address>:<Port> <User> <Passwd>" or defined in .config')
 | 
			
		||||
	parser.add_argument('url', metavar='URL', nargs='?', default=__website__)
 | 
			
		||||
 | 
			
		||||
	opt = parser.parse_args()
 | 
			
		||||
	opt.verbose = 0 if opt.verbose is None else sum(opt.verbose)
 | 
			
		||||
	return opt
 | 
			
		||||
 | 
			
		||||
def call_website(url,verbose,path='',interactive=False):
 | 
			
		||||
def call_website(url,verbose,path='',interactive=False,output='',mariadb=''):
 | 
			
		||||
 | 
			
		||||
	if path:
 | 
			
		||||
		if os.path.exists(path):
 | 
			
		||||
@@ -78,7 +81,7 @@ def call_website(url,verbose,path='',interactive=False):
 | 
			
		||||
	if(interactive):
 | 
			
		||||
		time.sleep(300)
 | 
			
		||||
	else:
 | 
			
		||||
		time.sleep(5)
 | 
			
		||||
		time.sleep(4)
 | 
			
		||||
 | 
			
		||||
	driver.close()
 | 
			
		||||
	return os.path.basename(href)
 | 
			
		||||
@@ -91,29 +94,31 @@ def remove_first_quote_if_odd(text, verbose = 0):
 | 
			
		||||
		# Find and remove the first quote
 | 
			
		||||
		for i, char in enumerate(text):
 | 
			
		||||
				if char in ['"']: # ['"', "'"]:
 | 
			
		||||
					if (verbose > 0):
 | 
			
		||||
					if (verbose > 1):
 | 
			
		||||
						print(text)
 | 
			
		||||
					text = text[:i] + text[i+1:]
 | 
			
		||||
					if (verbose > 0):
 | 
			
		||||
					if (verbose > 1):
 | 
			
		||||
						print(text)
 | 
			
		||||
					break
 | 
			
		||||
	return text
 | 
			
		||||
 | 
			
		||||
def is_clubstation(call):
	"""Return True when *call* is a club-station callsign.

	A callsign counts as a club station when its fourth character (index 3)
	is 'X', compared case-insensitively, or when the whole callsign is
	'OE5SIX' (also compared case-insensitively), which is special-cased.

	Args:
		call: Callsign string; must be longer than three characters.

	Returns:
		True for a club station, otherwise False.

	Raises:
		AssertionError: if *call* has three characters or fewer.
	"""
	assert len(call) > 3
	# OE5SIX is a club station although its fourth character is not 'X'.
	return call[3].upper() == 'X' or call.upper() == 'OE5SIX'
 | 
			
		||||
 | 
			
		||||
 | 
			
		||||
def replace_substring_with_line(path, search_substring, verbose=0):
 | 
			
		||||
	
 | 
			
		||||
	try:
 | 
			
		||||
		with open(path, 'r') as file:
 | 
			
		||||
			lines = file.readlines()
 | 
			
		||||
		
 | 
			
		||||
		search_substring
 | 
			
		||||
		for line in lines:
 | 
			
		||||
			if search_substring[0:40].lower() in line.lower():
 | 
			
		||||
		if not replace_substring_with_line.lines:
 | 
			
		||||
			with open(path, 'r') as file:
 | 
			
		||||
				replace_substring_with_line.lines = file.readlines()
 | 
			
		||||
 | 
			
		||||
		for line in replace_substring_with_line.lines:
 | 
			
		||||
			if search_substring[0:46].lower() in line.lower():
 | 
			
		||||
				modified_line = line.strip()
 | 
			
		||||
				# Replace the substring with the whole line
 | 
			
		||||
				## line = line.lower().replace(search_substring.lower(), modified_line)
 | 
			
		||||
@@ -127,6 +132,153 @@ def replace_substring_with_line(path, search_substring, verbose=0):
 | 
			
		||||
	
 | 
			
		||||
	return search_substring
 | 
			
		||||
 | 
			
		||||
replace_substring_with_line.lines = None
 | 
			
		||||
 | 
			
		||||
def gender_substring(path, search_substring, verbose=0):
	"""Look up the gender marker for a name in a gender file.

	The file at *path* is read once and its lines are cached on the function
	attribute ``gender_substring.lines``; later calls reuse that cache
	regardless of the *path* they pass. The needle is *search_substring*
	without its first two characters, compared case-insensitively. The first
	line that contains the needle wins and its first character is returned.

	Args:
		path: Path of the gender file.
		search_substring: Name to look up.
		verbose: Accepted but not used by this function.

	Returns:
		The first character of the first matching line, or 'x' when no line
		matches, when the needle is empty, or when reading the file failed
		(a message is printed in that case).
	"""
	try:
		if gender_substring.lines is None:
			with open(path, 'r') as file:
				gender_substring.lines = file.readlines()

		# NOTE(review): the first two characters of the name are dropped
		# before matching; the reason is not visible here - confirm against
		# the layout of the gender file.
		needle = search_substring[2:].lower()
		# An empty needle is contained in every line and would always yield
		# the marker of the first line in the file, so treat it as unknown.
		if needle:
			for line in gender_substring.lines:
				if needle in line.lower():
					return line[0]
	except FileNotFoundError:
		print(f'The file {path} was not found.')
	except Exception as e:
		print(f'An error occurred: {e}')

	return 'x' # not found, unknown gender

# Cache of the gender file's lines; None until the first successful read.
gender_substring.lines = None
 | 
			
		||||
 | 
			
		||||
def get_gender(firstnames, surname, call, verbose=0):
	"""Guess the gender for a person from the '.gender' file.

	The '.gender' file is looked for in the directory of this module. Only the
	first word of *firstnames* is used for the lookup, which matters when a
	person has more than one first name.

	Args:
		firstnames: One or more first names separated by blanks.
		surname: Surname; only used in the diagnostic message.
		call: Callsign; only used in the diagnostic message.
		verbose: When greater than 0, failed lookups are counted in
			``get_gender.cnt`` and reported on stdout.

	Returns:
		The marker returned by ``gender_substring``, or 'x' when the
		'.gender' file does not exist.
	"""
	genderfile = '.gender'
	module_dir = os.path.dirname(os.path.abspath(__file__))
	gpath = os.path.join(module_dir, genderfile)
	if not os.path.exists(gpath):
		return 'x'

	# Only the first of possibly several first names is looked up.
	firstname = firstnames.partition(' ')[0]
	gender = gender_substring(gpath, firstname, verbose=0)
	if gender == 'x' and verbose > 0:
		get_gender.cnt += 1
		print(f'({get_gender.cnt}){call} "{firstname}" [{firstnames} {surname}] not found in file {genderfile} - gender "x" is set.')
	return gender

# Number of failed lookups reported so far (only counted when verbose > 0).
get_gender.cnt = 0
 | 
			
		||||
 | 
			
		||||
def call_split_name(fullname, call, verbose):
	"""Split a "Surname Firstname(s)" string and look up the gender.

	The first blank-separated word is the surname, the remainder the first
	name(s). Surnames that start with one of the particles 'de', 'el', 'van',
	'von' or 'della' (compared case-insensitively) keep the following word as
	part of the surname, e.g. "de Lijezer" or "van Dijk". For 'van der',
	'von der' and 'van den' one more word is taken, e.g. "van der Meulen".
	The particle part of such a surname is returned in lower case.

	Args:
		fullname: Name with the surname first; must be longer than one
			character.
		call: Callsign, passed through to ``get_gender``.
		verbose: When greater than 0, composed surnames are printed; also
			passed through to ``get_gender``.

	Returns:
		A tuple ``(firstname, surname, gender)``. *firstname* is the literal
		'<unknown>' when nothing follows the surname.

	Raises:
		AssertionError: if *fullname* has one character or fewer.
	"""
	assert len(fullname) > 1

	name = fullname.split(' ', 1)
	surname = name[0]
	particle = surname.lower()

	if particle in ('de', 'el', 'van', 'von', 'della'):
		# Skip the particle and the blank after it, then split again.
		name = fullname[len(particle) + 1:].split(' ', 1)
		surname = particle + ' ' + name[0]
		# Two-word particles, e.g. "van der Meulen", "von der Vogelweide",
		# "van den Berg": take one more word into the surname.
		if particle in ('van', 'von') and surname.lower() in ('van der', 'von der', 'van den'):
			name = fullname[len(surname) + 1:].split(' ', 1)
			surname = surname.lower() + ' ' + name[0]
		if verbose > 0:
			print(f'## {fullname} --> {surname} ##')

	firstname = name[1] if len(name) > 1 else '<unknown>'

	# NOTE: a former heuristic (call suffix starting with 'Y' means gender
	# 'f') was switched off in the original code, so the gender always comes
	# from get_gender().
	gender = get_gender(firstname, surname, call, verbose)

	return firstname, surname, gender
 | 
			
		||||
 | 
			
		||||
def _typo_corrected_name(line, columns):
	"""Return the "Surname Firstname(s)" string stored in one fix-file line."""
	surname = line[columns[1]:columns[2] - 1].rstrip()
	# rstrip() rather than dropping the last character: the final line of the
	# file may have no trailing newline.
	firstname = line[columns[2]:].rstrip()
	return surname + ' ' + firstname


def fix_typo(call, fullname, verbose=1):
	"""Apply a manual name correction from the '.typo_callbook' file.

	The file is looked for in the directory of this module, read once and
	cached in ``fix_typo.lines``. Its first four lines are skipped. A line
	starting with '*' that consists of exactly four words defines the column
	positions (stored in ``fix_typo.spaces``); lines seen before that header
	is known are ignored. Afterwards every line whose characters 2..7 hold
	exactly *call* is evaluated by its first character:

	* '#': comment, printed only when *verbose* > 1.
	* 'F': fix - the name from the file replaces *fullname* if exactly one of
	  first name / surname differs.
	* 'X': exchange - the name from the file replaces *fullname* if it is
	  *fullname* with first name and surname swapped.

	If the name already equals the file entry, or the entry is implausible, a
	message is printed and *fullname* is kept.

	Args:
		call: Callsign of the record.
		fullname: Name as "Surname Firstname(s)".
		verbose: Verbosity; values above 0 print the names involved.

	Returns:
		The corrected name, or *fullname* unchanged when no correction applies,
		the file does not exist, or an error occurred (errors are printed).
	"""
	fixtypofile = '.typo_callbook'
	path = os.path.join(os.path.dirname(os.path.abspath(__file__)), fixtypofile)
	if not os.path.exists(path):
		return fullname

	try:
		if fix_typo.lines is None:
			with open(path, 'r') as file:
				fix_typo.lines = file.readlines()

		for line in fix_typo.lines[4:]: # the first four lines are skipped
			if not fix_typo.spaces:
				# Column layout unknown so far: wait for the '*' header line,
				# i.e. '*, call, nachname, vorname'.
				if line[0] == '*':
					words = line.split()
					assert len(words) == 4
					fix_typo.spaces = [line.index(word) for word in words[1:]]
				continue

			# Compare the whole field; a substring test would let a short
			# call such as OE1A match the row of OE1ABC.
			if line[2:8].strip() != call:
				continue
			print(f'Call: {call} found')

			flag = line[0]
			if flag == '#':
				if verbose > 1:
					print(line.rstrip())
				continue
			if flag not in ('F', 'X'):
				continue

			if verbose > 0:
				print(fullname)
				print(line.rstrip())
			firstname1, surname1, _ = call_split_name(fullname, call, 0)
			fullname2 = _typo_corrected_name(line, fix_typo.spaces)
			if verbose > 0:
				print(fullname2)
			firstname2, surname2, _ = call_split_name(fullname2, call, 0)

			unchanged = firstname1 == firstname2 and surname1 == surname2
			if flag == 'F':
				# Hardening: at least one of first name / surname must fit.
				plausible = firstname1 == firstname2 or surname1 == surname2
			else:
				# 'X': the entry must be the current name with both parts swapped.
				plausible = firstname1 == surname2 and surname1 == firstname2

			if unchanged:
				print(f'It is fixed! You can remove the line with the item {call} from the file {fixtypofile}!')
			elif not plausible:
				print(f'Something went wrong, there are several bugs. Check line with call {call} in file {fixtypofile}!')
			else:
				fullname = fullname2

	except FileNotFoundError:
		print(f'The file {path} was not found.')
	except Exception as e:
		print(f'An error occurred: {e}')

	return fullname

# Cached lines of the fix file; None until the first successful read.
fix_typo.lines = None
# Column positions [call, surname, firstname] taken from the '*' header line.
fix_typo.spaces = []
 | 
			
		||||
 | 
			
		||||
def call_data_record(line, mod_date, verbose):
 | 
			
		||||
 | 
			
		||||
@@ -153,31 +305,33 @@ def call_data_record(line, mod_date, verbose):
 | 
			
		||||
	match = re.search(r'^(OE[0-9][A-Z]{1,3})', call)
 | 
			
		||||
	assert(match.string == call)
 | 
			
		||||
	fullname = records[1]
 | 
			
		||||
	location = records[2]
 | 
			
		||||
	address  = records[3]
 | 
			
		||||
	permit_class = records[4]
 | 
			
		||||
	fullname = fix_typo(call, fullname, verbose)
 | 
			
		||||
	# If there is a clubstation
 | 
			
		||||
	if is_clubstation(call):
 | 
			
		||||
		# Name starting with only one quotation marks e.g. " -- remove that one:
 | 
			
		||||
		fullname = remove_first_quote_if_odd(fullname, verbose) # only found @ clubstations
 | 
			
		||||
		clubstationfile = '.callbook_club'
 | 
			
		||||
		if verbose > 0:
 | 
			
		||||
		if verbose > 1:
 | 
			
		||||
			print(f'Call: {call}, Name: {fullname}')
 | 
			
		||||
		path = os.path.join(os.path.dirname(os.path.abspath(__file__)), clubstationfile)
 | 
			
		||||
		if os.path.exists(path):
 | 
			
		||||
			fullname = replace_substring_with_line(path, fullname, verbose)
 | 
			
		||||
		if verbose > 0:
 | 
			
		||||
			print(f'Call: {call}, Name: {fullname}')
 | 
			
		||||
			
 | 
			
		||||
		gender = '*'
 | 
			
		||||
	elif fullname[0] == '*':
 | 
			
		||||
		gender = '*'
 | 
			
		||||
	else: # Try to split the YL or OMs Name, guess the gender
 | 
			
		||||
		firstname, surname, gender = call_split_name(fullname, call, verbose)
 | 
			
		||||
	if verbose > 1:
 | 
			
		||||
		if gender == '*':
 | 
			
		||||
			print(f'Call: {call}, Name: {fullname}, Gender: {gender}')
 | 
			
		||||
		else:
 | 
			
		||||
			print(f'Call: {call}, First Name: {firstname}, Surname: {surname}, Gender: {gender}')
 | 
			
		||||
 | 
			
		||||
		print(f'Location: {location}, Address: {address}, Permit: {permit_class}')
 | 
			
		||||
 | 
			
		||||
#	if not record:
 | 
			
		||||
#		return
 | 
			
		||||
#	if verbose == 1:
 | 
			
		||||
#		print(record.group(1))
 | 
			
		||||
#	if verbose >= 3:
 | 
			
		||||
#		print(f'Call: {record.group(1)}')
 | 
			
		||||
#		print(f'Name: {record.group(2)}')
 | 
			
		||||
#		#print(f'Location: {record[3]}')
 | 
			
		||||
#		#print(f'Address: {record[4]}')
 | 
			
		||||
#		#print(f'Permit Class: {record[5]}')
 | 
			
		||||
 | 
			
		||||
def call_analyse_pdf(file, verbose):
 | 
			
		||||
 | 
			
		||||
@@ -196,13 +350,13 @@ def call_analyse_pdf(file, verbose):
 | 
			
		||||
		print(f'   Title: {meta.title}')
 | 
			
		||||
		print(f' Created: {meta.creation_date}')
 | 
			
		||||
		print(f'Modified: {meta.modification_date}')
 | 
			
		||||
		
 | 
			
		||||
 | 
			
		||||
	for page in reader.pages:
 | 
			
		||||
		page_text = page.extract_text(extraction_mode="layout", layout_mode_space_vertically=False)
 | 
			
		||||
		lines = page_text.strip().splitlines()
 | 
			
		||||
		for line in lines[3:-2]:
 | 
			
		||||
			line = line.strip()
 | 
			
		||||
			# calls = re.findall(r' +(OE[0-9][A-Z]{1,3}).*$', page_text)
 | 
			
		||||
 | 
			
		||||
			if verbose >= 2:
 | 
			
		||||
				print(line)
 | 
			
		||||
			call_data_record(line, meta.modification_date,verbose)
 | 
			
		||||
@@ -210,11 +364,11 @@ def call_analyse_pdf(file, verbose):
 | 
			
		||||
if __name__ == '__main__':
 | 
			
		||||
	# call_description()
 | 
			
		||||
	args = call_parser()
 | 
			
		||||
	# filename = 'Rufzeichenliste_AT_Stand_010624.pdf'
 | 
			
		||||
 | 
			
		||||
	try:
 | 
			
		||||
		filename = call_website(**vars(args))
 | 
			
		||||
 | 
			
		||||
		print(f'Filename: {filename}')
 | 
			
		||||
		if args.verbose > 1:
 | 
			
		||||
			print(f'Filename: {filename}')
 | 
			
		||||
		call_analyse_pdf(filename,args.verbose)
 | 
			
		||||
		sys.exit(0)
 | 
			
		||||
	except Exception as e:
 | 
			
		||||
 
 | 
			
		||||
		Reference in New Issue
	
	Block a user