Bulk Generating Cloze Deletions based on Tatoeba sentences and word frequency lists

wizzie · (This post was last modified: Dec-23-2019, 01:00 PM by wizzie.)

It threw the following error

Error:
writerow() takes exactly one argument (4 given)

which I fixed by adding "[" and "]" around the writerow arguments, since writerow() expects a single list of fields rather than separate arguments. Then it threw

Error:
NameError: name 'random' is not defined

which I fixed by adding "import random" and it now works.

Thank you so much for your help, it is much appreciated! You saved me many hours of manual work.
The final code is as follows.

import csv
import random
import re
import string

def find_cloze(sentence, frequency_list):
    """Choose the word of *sentence* that should be turned into a cloze.

    ASCII punctuation is replaced by spaces before the sentence is split
    on whitespace. Tokens that are all upper-case or title-case, and
    tokens of two characters or fewer, are never candidates.

    Args:
        sentence: The sentence to pick a word from.
        frequency_list: Mapping from lower-case word to a value that
            ``int()`` accepts. Words missing from the mapping are treated
            as having the value 20001.

    Returns:
        The candidate with the smallest value below 20001 (the earliest
        one wins a tie). If no candidate is below that threshold, a
        randomly chosen candidate. ``None`` when there are no candidates.
    """
    # 50k frequency list cut down to 20,000 entries, so anything at or
    # above this value counts as "not in the list".
    unlisted = 20001

    blank_out_punctuation = str.maketrans(
        {mark: ' ' for mark in string.punctuation})
    tokens = sentence.translate(blank_out_punctuation).split()

    # Drop proper nouns / acronyms and tiny words.
    candidates = [
        token for token in tokens
        if len(token) > 2 and not (token.isupper() or token.istitle())
    ]
    if not candidates:
        return None

    values = [
        int(frequency_list.get(token.lower(), unlisted))
        for token in candidates
    ]
    lowest = min(values)
    if lowest < unlisted:
        # list.index returns the first position, matching first-wins on ties.
        return candidates[values.index(lowest)]

    # None of the candidates appears in the frequency list.
    return random.choice(candidates)


def make_index(path, delimiter, value=1):
    """Build a lookup dict from a delimited text file.

    Args:
        path: Path of the file to read.
        delimiter: Single-character field separator passed to ``csv.reader``.
        value: Index of the column whose content becomes the dict value.

    Returns:
        A dict mapping the first column of each row to the column at
        index *value*. When a key occurs on several rows, the last row wins.

    Raises:
        IndexError: If a non-blank row has no column at index *value*.
    """
    index = {}
    with open(path, newline='') as f:
        for row in csv.reader(f, delimiter=delimiter):
            if not row:
                # csv.reader yields an empty list for a blank line; without
                # this guard row[0] would raise IndexError.
                continue
            index[row[0]] = row[value]
    return index


def _insert_cloze(sentence, word):
    """Wrap occurrences of *word* in *sentence* in ``{{c1::word}}`` markup.

    Only whole-word occurrences are wrapped, so a cloze word such as "the"
    does not also rewrite the inside of "other". If no whole-word
    occurrence is found (for example when the word is bordered by "_"),
    every substring occurrence is wrapped instead.
    """
    marked = '{{{{c1::{}}}}}'.format(word)
    # Lookarounds rather than \b so that words beginning or ending with a
    # non-word character (e.g. non-ASCII punctuation) can still match.
    pattern = r'(?<!\w)' + re.escape(word) + r'(?!\w)'
    # A function replacement keeps backslashes in the word literal.
    result, hits = re.subn(pattern, lambda _match: marked, sentence)
    if hits:
        return result
    return sentence.replace(word, marked)


def generate(target_file, native_file, links_file, frequency_file):
    """Write cloze rows for every target sentence that has a translation.

    Output goes to ``out.csv`` in the current directory as tab-separated
    rows: target sentence id, target sentence with the cloze word marked
    up, native sentence id, native sentence.

    Args:
        target_file: Tab-separated file; column 0 is the sentence id and
            column 2 the sentence text (target language).
        native_file: Same layout as *target_file*, for the native language.
        links_file: Tab-separated file mapping a sentence id (column 0) to
            the id of a translation (column 1). Only one translation per
            id is kept by ``make_index``.
        frequency_file: Space-separated file mapping a word (column 0) to
            its frequency value (column 1).
    """
    print("Making indexes ...")

    target = make_index(target_file, '\t', value=2)
    native = make_index(native_file, '\t', value=2)
    links = make_index(links_file, '\t')

    # Make index between word and usage frequency
    frequency = make_index(frequency_file, ' ')

    print("Generating clozes ...")
    with open("out.csv", 'w', newline='') as outfile:
        writer = csv.writer(
            outfile,
            delimiter='\t',
            quotechar='|',
            quoting=csv.QUOTE_MINIMAL)

        # For each target sentence
        for target_number, target_sentence in target.items():
            # Lookup native translation
            native_number = links.get(target_number)
            if not native_number:
                continue  # If no native translation, skip

            native_sentence = native.get(native_number)
            if not native_sentence:
                continue  # If no native translation, skip

            # Find the cloze word
            target_cloze_word = find_cloze(target_sentence, frequency)
            if not target_cloze_word:
                continue  # If no cloze word, skip

            clozed = _insert_cloze(target_sentence, target_cloze_word)

            writer.writerow(
                [target_number,
                 clozed,
                 native_number,
                 native_sentence])

    print("Done.")

# Run the generator only when this file is executed as a script, so that
# importing it does not read the input files or write out.csv.
if __name__ == "__main__":
    generate('target.csv',
             'native.csv',
             'links.csv',
             'frequency.txt')

Edit: Changed code to be "language agnostic".

Possibly Related Threads…
Thread		Author	Replies	Views	Last Post
	Bulk loading of data using python	shivamsvmsri	2	808	Sep-28-2023, 09:04 AM Last Post: shivamsvmsri
	seaching for a library: nondeterministic letter manipulation in sentences	Myron	2	967	Dec-05-2022, 03:53 PM Last Post: Myron
	Problem: Check if a list contains a word and then continue with the next word	Mangono	2	2,598	Aug-12-2021, 04:25 PM Last Post: palladium
	regex pattern to extract relevant sentences	Bubly	2	1,949	Jul-06-2021, 04:17 PM Last Post: Bubly
	How can I get Python Bulk Email Verification Script With API?	zainalee	1	2,553	Jun-06-2021, 09:19 AM Last Post: snippsat
	Extract specific sentences from text file	Bubly	3	3,529	May-31-2021, 06:55 PM Last Post: Larz60+
	Python Bulk Email Verification Script With API	Aj1128	0	2,682	Nov-28-2020, 11:38 AM Last Post: Aj1128
	Bulk add column to dataframe	sambanerjee	1	2,193	Sep-24-2020, 07:34 PM Last Post: sambanerjee
	Split dict of lists into smaller dicts of lists.	pcs3rd	3	2,523	Sep-19-2020, 09:12 AM Last Post: ibreeden
	bulk update in elasticsearch	pythonlearner1	1	6,079	Jun-10-2020, 10:01 PM Last Post: pythonlearner1

Bulk Generating Cloze Deletions based on Tatoeba sentences and word frequency lists

User Panel Messages

Announcements