Creating a word length frequency table in Python

I have the following code:

import re

def get_filename():
    """Prompt the user for the name of the file to analyse and return it."""
    return input("Please enter filename: ")
    
def get_words_from_file(filename):
    """Return the lowercase words found between the start and end markers.

    The file is read as UTF-8. Lines are ignored until one begins with
    "*** START OF"; processing stops at the first line beginning with
    "*** END". If no end marker is present, every line after the start
    marker is used.

    A word is a run of letters a-z, optionally joined to a second run by a
    single hyphen or apostrophe ("well-known", "don't") or followed by a
    trailing apostrophe ("dogs'").
    """
    # Matches exactly what the original three-way alternation matched: its
    # final bare "[a-z]+" branch was unreachable because "[a-z]+'?" already
    # accepts everything that branch would.
    word_pattern = re.compile(r"[a-z]+[-'][a-z]+|[a-z]+'?")

    # Read everything up front so the file is closed before parsing starts.
    with open(filename, 'r', encoding='utf-8') as infile:
        lines = infile.read().splitlines()

    words = []
    reading = False
    for line in lines:
        if not reading and line.startswith("*** START OF"):
            reading = True
        elif line.startswith("*** END"):
            # Also covers the longer "*** END OF SYNTHETIC TEST CASE ***".
            return words
        elif reading:
            words.extend(word_pattern.findall(line.lower()))
    return words

def calculate(words):
    """Compute summary statistics for a list of words.

    Returns a tuple (number_of_words, average, max_word_length,
    max_frequency): the word count, the mean word length, the length of the
    longest word and the highest number of times any single word occurs.

    An empty list yields (0, 0.0, 0, 0) instead of raising
    ZeroDivisionError.
    """
    number_of_words = len(words)
    if number_of_words == 0:
        return (0, 0.0, 0, 0)

    lengths = [len(word) for word in words]
    average = sum(lengths) / number_of_words
    max_word_length = max(lengths)

    # Count occurrences of each distinct word.
    frequency = {}
    for word in words:
        frequency[word] = frequency.get(word, 0) + 1
    max_frequency = max(frequency.values())

    return (number_of_words, average, max_word_length, max_frequency)

def get_frequency(words):
    """Count how often each word and each word length occurs.

    Returns a tuple (frequency_1, len_count): frequency_1 maps each word to
    its number of occurrences, and len_count maps each word length to the
    number of words of that length.
    """
    frequency_1 = {}
    len_count = {}
    for word in words:
        frequency_1[word] = frequency_1.get(word, 0) + 1
        word_len = len(word)
        len_count[word_len] = len_count.get(word_len, 0) + 1
    return (frequency_1, len_count)

def print_results(stats_tuple, lengthy):
    """Print the word summary followed by the length/frequency table.

    stats_tuple is (number_of_words, average, max_word_length,
    max_frequency); lengthy is the (frequency_1, len_count) pair produced by
    get_frequency. The table has one row for every length from 1 up to the
    longest length present, with 0 for lengths that do not occur.
    """
    (_, len_count) = lengthy
    (number_of_words, average, max_word_length, max_frequency) = stats_tuple
    print("")
    print("Word summary (all words):")
    print(" Number of words = {0}".format(number_of_words))
    print(" Average word length = {:.2f}".format(average))
    print(" Maximum word length = {0}".format(max_word_length))
    print(" Maximum frequency = {0}".format(max_frequency))
    print("")
    print(" Len  Freq")
    # default=0 keeps an empty input from raising; no rows are printed then.
    longest = max(len_count, default=0)
    for word_len in range(1, longest + 1):
        # Right-align both columns under the " Len  Freq" header.
        print(f"{word_len:>4}{len_count.get(word_len, 0):>6}")
    

def main():
    """Ask for a filename, analyse its words and print the report."""
    filename = get_filename()
    words = get_words_from_file(filename)
    print_results(calculate(words), get_frequency(words))


# Run only when executed as a script, so that importing this module does not
# block on the filename prompt.
if __name__ == "__main__":
    main()

Python
 
import re
​
def get_filename():
    """Prompt the user for the name of the file to analyse and return it."""
    return input("Please enter filename: ")
    
def get_words_from_file(filename):
    """Return the lowercase words found between the start and end markers.

    The file is read as UTF-8. Lines are ignored until one begins with
    "*** START OF"; processing stops at the first line beginning with
    "*** END". If no end marker is present, every line after the start
    marker is used.

    A word is a run of letters a-z, optionally joined to a second run by a
    single hyphen or apostrophe ("well-known", "don't") or followed by a
    trailing apostrophe ("dogs'").
    """
    # Matches exactly what the original three-way alternation matched: its
    # final bare "[a-z]+" branch was unreachable because "[a-z]+'?" already
    # accepts everything that branch would.
    word_pattern = re.compile(r"[a-z]+[-'][a-z]+|[a-z]+'?")

    # Read everything up front so the file is closed before parsing starts.
    with open(filename, 'r', encoding='utf-8') as infile:
        lines = infile.read().splitlines()

    words = []
    reading = False
    for line in lines:
        if not reading and line.startswith("*** START OF"):
            reading = True
        elif line.startswith("*** END"):
            # Also covers the longer "*** END OF SYNTHETIC TEST CASE ***".
            return words
        elif reading:
            words.extend(word_pattern.findall(line.lower()))
    return words
​
def calculate(words):
    """Compute summary statistics for a list of words.

    Returns a tuple (number_of_words, average, max_word_length,
    max_frequency): the word count, the mean word length, the length of the
    longest word and the highest number of times any single word occurs.

    An empty list yields (0, 0.0, 0, 0) instead of raising
    ZeroDivisionError.
    """
    number_of_words = len(words)
    if number_of_words == 0:
        return (0, 0.0, 0, 0)

    lengths = [len(word) for word in words]
    average = sum(lengths) / number_of_words
    max_word_length = max(lengths)

    # Count occurrences of each distinct word.
    frequency = {}
    for word in words:
        frequency[word] = frequency.get(word, 0) + 1
    max_frequency = max(frequency.values())

    return (number_of_words, average, max_word_length, max_frequency)
​
def get_frequency(words):
    """Count how often each word and each word length occurs.

    Returns a tuple (frequency_1, len_count): frequency_1 maps each word to
    its number of occurrences, and len_count maps each word length to the
    number of words of that length.
    """
    frequency_1 = {}
    len_count = {}
    for word in words:
        frequency_1[word] = frequency_1.get(word, 0) + 1
        word_len = len(word)
        len_count[word_len] = len_count.get(word_len, 0) + 1
    return (frequency_1, len_count)

def print_results(stats_tuple, lengthy):
    """Print the word summary followed by the length/frequency table.

    stats_tuple is (number_of_words, average, max_word_length,
    max_frequency); lengthy is the (frequency_1, len_count) pair produced by
    get_frequency. The table has one row for every length from 1 up to the
    longest length present, with 0 for lengths that do not occur.
    """
    (_, len_count) = lengthy
    (number_of_words, average, max_word_length, max_frequency) = stats_tuple
    print("")
    print("Word summary (all words):")
    print(" Number of words = {0}".format(number_of_words))
    print(" Average word length = {:.2f}".format(average))
    print(" Maximum word length = {0}".format(max_word_length))
    print(" Maximum frequency = {0}".format(max_frequency))
    print("")
    print(" Len  Freq")
    # default=0 keeps an empty input from raising; no rows are printed then.
    longest = max(len_count, default=0)
    for word_len in range(1, longest + 1):
        # Right-align both columns under the " Len  Freq" header.
        print(f"{word_len:>4}{len_count.get(word_len, 0):>6}")
    
​
def main():
    """Ask for a filename, analyse its words and print the report."""
    filename = get_filename()
    words = get_words_from_file(filename)
    print_results(calculate(words), get_frequency(words))


# Run only when executed as a script, so that importing this module does not
# block on the filename prompt.
if __name__ == "__main__":
    main()
​

Without importing anything else, how would I make a table that prints each word length followed by the number of words of that length?

For example: In a file with the text of “a blah ba ba” it would print:

Output
 
Len  Freq
   1     1
   2     2
   3     0
   4     1
​

What confuses me is how to count the words of each length. Should I build a new list for each length containing all the words of that length and then take the length of that list, or is there a better way to do it?

Answer

# Tally how many whitespace-separated words of each length the file contains.
# `filename` is expected to be defined by the surrounding code.
len_count = {}
# Read as UTF-8 explicitly, matching the question's own file reader; without
# it the decoding depends on the platform's default encoding.
with open(filename, "r", encoding="utf-8") as file:
    for line in file:
        for word in line.split():
            word_len = len(word)
            # get() supplies 0 the first time a length is seen.
            len_count[word_len] = len_count.get(word_len, 0) + 1

Python
 
# Tally how many whitespace-separated words of each length the file contains.
# `filename` is expected to be defined by the surrounding code.
len_count = {}
# Read as UTF-8 explicitly, matching the question's own file reader; without
# it the decoding depends on the platform's default encoding.
with open(filename, "r", encoding="utf-8") as file:
    for line in file:
        for word in line.split():
            word_len = len(word)
            # get() supplies 0 the first time a length is seen.
            len_count[word_len] = len_count.get(word_len, 0) + 1
​

Then you can print the two columns:

# Header and rows are tab-separated; lengths with no words print 0.
print("Len\tFreq")
for word_len in range(1, max(len_count) + 1):
    print(f'{word_len}\t{len_count.get(word_len, 0)}')

Python
 
# Header and rows are tab-separated; lengths with no words print 0.
print("Len\tFreq")
for word_len in range(1, max(len_count) + 1):
    print(f'{word_len}\t{len_count.get(word_len, 0)}')
​

EDIT: To handle empty files:

# An empty file leaves len_count empty, and max() of an empty dict raises
# ValueError, so fall back to 0 and print no rows.
max_len_count = 0 if not len_count else max(len_count)
for word_len in range(1, max_len_count + 1):
    print(f'{word_len}\t{len_count.get(word_len, 0)}')

Python
 
# An empty file leaves len_count empty, and max() of an empty dict raises
# ValueError, so fall back to 0 and print no rows.
max_len_count = 0 if not len_count else max(len_count)
for word_len in range(1, max_len_count + 1):
    print(f'{word_len}\t{len_count.get(word_len, 0)}')
​

Advertisement

Answer