"""Generate pseudo-words that mimic the letter statistics of a text sample.

The script reads the beginning of ``sherlock-holmes.txt``, counts

* how often each word length occurs (``wLens``),
* how often each letter occurs (``lFreq``),
* how often each letter follows another one (``lCorel``; the key ``" "``
  holds the distribution of word-initial letters),

prints a short summary of those tables and finally prints 40 random words
drawn from them.
"""
from random import randint

# Size hint handed to ``readlines``: only roughly this much of the file is
# sampled (whole lines are read until the hint is reached), not the full text.
_SAMPLE_SIZE_HINT = 10000

# Punctuation removed from every line before it is split into words.
_PUNCTUATION = str.maketrans("", "", "\"'-.?!;,")

wLens = {}       # word length -> number of words with that length
lFreq = {}       # letter -> number of occurrences
lCorel = {}      # previous letter (" " = word start) -> {next letter -> count}
lFreqCheck = {}  # not used by this script; kept so the module's names stay the same


def _weightedChoice(counts):
    """Return a random key of *counts*, picked proportionally to its count.

    Raises:
        ValueError: if *counts* is empty or all its counts sum to zero.
    """
    total = sum(counts.values())
    if total <= 0:
        raise ValueError("cannot sample from an empty frequency table")
    # randint is inclusive on both ends; starting at 1 gives exactly `total`
    # equally likely outcomes, so no entry receives extra weight.
    r = randint(1, total)
    for key, count in counts.items():
        r -= count
        if r <= 0:
            return key


def _topCounts(counts, limit=8):
    """Return the *limit* most frequent ``(key, count)`` pairs of *counts*."""
    return sorted(counts.items(), key=lambda s: -s[1])[:limit]


def generateLetter(prev=" "):
    """Return a random letter that is likely to follow *prev*.

    *prev* is the previously generated character; ``" "`` (the default) asks
    for a word-initial letter.  A letter that was never seen with a successor
    in the sample (it only occurred at the end of words) has no entry in
    ``lCorel``; the word-initial distribution is used for it instead of
    failing with ``KeyError``.

    Raises:
        ValueError: if no letter statistics have been collected at all.
    """
    followers = lCorel.get(prev) or lCorel.get(" ", {})
    return _weightedChoice(followers)


def generateWordLen():
    """Return a random word length drawn from the observed lengths.

    Raises:
        ValueError: if no word lengths have been collected.
    """
    return _weightedChoice(wLens)


def generateWords(wordCount):
    """Return *wordCount* random words as a single string.

    The string starts with a space and every word is followed by a space,
    e.g. ``" abc de "`` for two words.
    """
    pieces = [" "]
    for _ in range(wordCount):
        prev = " "  # every word starts from the word-initial distribution
        for _ in range(generateWordLen()):
            prev = generateLetter(prev)
            pieces.append(prev)
        pieces.append(" ")
    return "".join(pieces)


# Read the sample; the context manager closes the file once it has been read.
with open("sherlock-holmes.txt", "r") as inFile:
    lines = inFile.readlines(_SAMPLE_SIZE_HINT)

for line in lines:
    # split() without an argument splits on any whitespace run and never
    # yields empty strings, so gaps left behind by removed punctuation are
    # not counted as zero-length words.
    for w in line.translate(_PUNCTUATION).lower().split():
        wLens[len(w)] = wLens.get(len(w), 0) + 1
        prev = " "  # marks the start of a word
        for ch in w:
            lFreq[ch] = lFreq.get(ch, 0) + 1
            followers = lCorel.setdefault(prev, {})
            followers[ch] = followers.get(ch, 0) + 1
            prev = ch

print("Word lentghs")
print(wLens)

print("Letter frequency")
print("\t" + str(_topCounts(lFreq)) + "...")

print("Letter corelation")
for c, followers in lCorel.items():
    print("\t" + c + ":\t" + str(_topCounts(followers)) + "...")

print("\n\n")

print(generateWords(40))