import re def find_regexp(regexp, filename="slovnik.txt"): with open(filename, "r") as my_file: for line in my_file: if re.search(regexp, line): strip_line = line.rstrip() print(strip_line, end=", ") print("\n") # Vypsat vsechny retezce, ktere obsahuji podretezec 'oo' find_regexp(r'oo') # Vypsat vsechny retezce, ktere zacinaji na 'e' a konci na 'le' # Vypise pouze erteple, elle, emile, find_regexp(r'^e.*le$') # Vypsat vsechny retezce, ktere obsahuji 'a', 'e', 'i', 'o', 'u' v tomto # poradi (ale ne nutne za sebou, napr. akademickou) find_regexp(r'a.*e.*i.*o.*u') # Vypsat vsechny retezce, ktere obsahuji podretezec delky 4 # tvoreny z pismen "rst" (napr. bratrstvi) find_regexp(r'[rst]{4}') # Vypsat vsechny retezce, dve 'u' vzdalena od sebe 8 pozic (napr. 'uhlovodiku') find_regexp(r'u.{8}u') # Obsahuji pismeno 'u' na druhe i predposledni pozici (napr. 'luxus') find_regexp(r'^.u.*u.$') # KromÄ› prvniho a posledniho pismene obsahuji pouze samohlasky a maji presne # 5 pismen (napr. 'foyer') find_regexp(r'^.[aeiouy]{3}.$') def load_data(): with open('sherlock.txt', 'r') as file: dct = {} # load data do dictionary for line in file: # clean text clear_line = cleanString = re.sub('\W+', ' ', line) for word in clear_line.split(' '): if len(word) > 0: dct[word] = dct.get(word, 0) + 1 return dct def top_n(dct, n): i = 0 result = reversed(sorted(dct, key=dct.get)) # creates iterator for i in range(n): el = next(result) # next element form iterator print(str(i + 1) + ".", "\"" + el + "\"", "has", dct[el], "occurrences") def word_average(dct): sum_key_length = 0 for key, value in dct.items(): sum_key_length += (len(key) * value) print("Average length of word is", sum_key_length / sum(dct.values())) d = load_data() top_n(d, 10) print() word_average(d)