![]() |
How to parse and group hierarchical list items from an unindented string in Python? - Printable Version +- Python Forum (https://python-forum.io) +-- Forum: Python Coding (https://python-forum.io/forum-7.html) +--- Forum: Homework (https://python-forum.io/forum-9.html) +--- Thread: How to parse and group hierarchical list items from an unindented string in Python? (/thread-41853.html) |
How to parse and group hierarchical list items from an unindented string in Python? - ann23fr - Mar-27-2024 Problem Statement
Goal The goal is to create a Python method that takes an unindented string as input, identifies the top-level items and their associated lower-level items based on the specified criteria, concatenates them into a single string for each top-level item while maintaining the original formatting and delimiters, and returns the resulting grouped list items as a Python list. The output list should match the desired format, with each element representing a top-level item and its associated lower-level items. Request Please provide an explanation and guidance on how to create a Python method that can successfully achieve the goal outlined above. The explanation should include the steps involved, any necessary data structures or algorithms, and considerations for handling different scenarios and edge cases. Additional Details
Code Attempts: Attempt 1: def process_list_hierarchy(text): # Helper function to determine the indentation level def get_indentation_level(line): return len(line) - len(line.lstrip()) # Helper function to parse the input text into a list of lines with their hierarchy levels def parse_hierarchy(text): lines = text.split('\n') hierarchy = [] for line in lines: if line.strip(): # Ignore empty lines level = get_indentation_level(line) hierarchy.append((level, line.strip())) return hierarchy # Helper function to build a tree structure from the hierarchy levels def build_tree(hierarchy): tree = [] stack = [(-1, tree)] # Start with a dummy root level for level, content in hierarchy: # Find the correct parent level while stack and stack[-1][0] >= level: stack.pop() # Create a new node and add it to its parent's children node = {'content': content, 'children': []} stack[-1][1].append(node) stack.append((level, node['children'])) return tree # Helper function to combine the tree into a single list def combine_tree(tree, combined_list=[], level=0): for node in tree: combined_list.append((' ' * level) + node['content']) if node['children']: combine_tree(node['children'], combined_list, level + 1) return combined_list # Parse the input text into a hierarchy hierarchy = parse_hierarchy(text) # Build a tree structure from the hierarchy tree = build_tree(hierarchy) # Combine the tree into a single list while maintaining the hierarchy combined_list = combine_tree(tree) # Return the combined list as a string return '\n'.join(combined_list)Attempt 2: def organize_hierarchically(items): def get_level(item): match = re.match(r'^(\d+\.?|\-|\*)', item) return len(match.group()) if match else 0 grouped_items = [] for level, group in groupby(items, key=get_level): if level == 1: grouped_items.append('\n'.join(group)) else: grouped_items[-1] += '\n' + '\n'.join(group) return grouped_itemsAttempt 3: from bs4 import BeautifulSoup import nltk def extract_sub_objectives(input_text): soup = BeautifulSoup(input_text, 'html.parser') text_content = soup.get_text() # Tokenize the text into sentences sentences = nltk.sent_tokenize(text_content) # Initialize an empty list to store the sub-objectives sub_objectives = [] # Iterate through the sentences and extract sub-objectives current_sub_objective = "" for sentence in sentences: if sentence.startswith(("1.", "2.", "3.", "4.")): if current_sub_objective: sub_objectives.append(current_sub_objective) current_sub_objective = "" current_sub_objective += sentence + "\n" elif current_sub_objective: current_sub_objective += sentence + "\n" # Append the last sub-objective, if any if current_sub_objective: sub_objectives.append(current_sub_objective) return sub_objectivesAttempt 4: def extract_sub_objectives(input_text, preserve_formatting=False): # Modified to strip both single and double quotes input_text = input_text.strip('\'"') messages = [] messages.append("Debug: Starting to process the input text.") # Debug message to show the input text after stripping quotes messages.append(f"Debug: Input text after stripping quotes: '{input_text}'") # Define possible starting characters for new sub-objectives start_chars = [str(i) + '.' for i in range(1, 100)] # Now includes up to two-digit numbering messages.append(f"Debug: Start characters defined: {start_chars}") # Define a broader range of continuation characters continuation_chars = ['-', '*', '+', '•', '>', '→', '—'] # Expanded list messages.append(f"Debug: Continuation characters defined: {continuation_chars}") # Replace escaped newline characters with actual newline characters input_text = input_text.replace('\\n', '\n') # Split the input text into lines lines = input_text.split('\n') messages.append(f"Debug: Input text split into lines: {lines}") # Initialize an empty list to store the sub-objectives sub_objectives = [] # Initialize an empty string to store the current sub-objective current_sub_objective = '' # Initialize a counter for the number of continuations in the current sub-objective continuation_count = 0 # Function to determine if a line is a new sub-objective def is_new_sub_objective(line): # Strip away leading quotation marks and whitespace line = line.strip('\'"').strip() return any(line.startswith(start_char) for start_char in start_chars) # Function to determine if a line is a continuation def is_continuation(line, prev_line): if not prev_line: return False # Check if the line starts with an alphanumeric followed by a period or parenthesis if len(line) > 1 and line[0].isalnum() and (line[1] == '.' or line[1] == ')'): # Check if it follows the sequence of the previous line if line[0].isdigit() and prev_line[0].isdigit() and int(line[0]) == int(prev_line[0]) + 1: return False elif line[0].isalpha() and prev_line[0].isalpha() and ord(line[0].lower()) == ord(prev_line[0].lower()) + 1: return False else: return True # Add a condition to check for lower-case letters followed by a full stop if line[0].islower() and line[1] == '.': return True return any(line.startswith(continuation_char) for continuation_char in continuation_chars) # Iterate over each line for i, line in enumerate(lines): prev_line = lines[i - 1] if i > 0 else '' # Check if the line is a new sub-objective if is_new_sub_objective(line): messages.append(f"Debug: Found a new sub-objective at line {i + 1}: '{line}'") # If we have a current sub-objective, check the continuation count if current_sub_objective: if continuation_count < 2: messages.append(f"Debug: Sub-objective does not meet the continuation criterion: '{current_sub_objective}'") for message in messages: print(message) return None # Check the preserve_formatting parameter before adding sub_objectives.append( current_sub_objective.strip() if not preserve_formatting else current_sub_objective) messages.append(f"Debug: Added a sub-objective to the list. Current count: {len(sub_objectives)}.") # Reset the current sub-objective to the new one and reset the continuation count current_sub_objective = line continuation_count = 0 # Check if the line is a continuation elif is_continuation(line, prev_line): messages.append(f"Debug: Line {i + 1} is a continuation of the previous line: '{line}'") # Add the line to the current sub-objective, checking preserve_formatting current_sub_objective += '\n' + line if preserve_formatting else ' ' + line.strip() # Increment the continuation count continuation_count += 1 # Handle lines that are part of the current sub-objective but don't start with a continuation character elif current_sub_objective: messages.append(f"Debug: Line {i + 1} is part of the current sub-objective: '{line}'") # Add the line to the current sub-objective, checking preserve_formatting current_sub_objective += '\n' + line if preserve_formatting else ' ' + line.strip() # If we have a current sub-objective, check the continuation count before adding it to the list if current_sub_objective: if continuation_count < 2: messages.append(f"Debug: Sub-objective does not meet the continuation criterion: '{current_sub_objective}'") for message in messages: print(message) return None # Check the preserve_formatting parameter before adding sub_objectives.append(current_sub_objective.strip() if not preserve_formatting else current_sub_objective) messages.append(f"Debug: Added the final sub-objective to the list. Final count: {len(sub_objectives)}.") # Print the debug messages if no sub-objectives are found if not sub_objectives: for message in messages: print(message) return sub_objectivesSample Data (Inputs and associated Outputs): https://pastebin.com/s8nWktbZ RE: How to parse and group hierarchical list items from an unindented string in Python? - Pedroski55 - May-23-2024 Just using Input1 as text, I tried the code below. First I saved Input1 as input.txt For better formatting maybe you should use the Python module docx, which has more options. I doubt if all your text files have the same style, so you may need some more re expressions! That depends on what is in your other texts. import re # you can do the following with Python but do it like this for now for testing # copy Input1 text from sample_data_strings.txt and paste it into a text editor # remove " at both ends, just ' remains # put [ at the beginning and ] at the end # now you have a list. # call the list l # Paste the list into your Python IDE: l = ['Opening: ... players.'] # save the list l # savepath = '/home/pedro/myPython/re/text_files/unformatted_text1.txt' # with open(savepath, 'w') as output: output.writelines(l) # now you have your text as lines of text, open with readlines() # path to the unformatted lines of text from above Input1 = '/home/pedro/myPython/re/text_files/unformatted_text1.txt' # get the text as a list with open(Input1, 'r') as infile: text_list = infile.readlines() # have a look for line in text_list: print(line) # patterns to look for p = re.compile(r'([A-Za-z]+:)') # get text followed by : like: Opening: endline = re.compile(r'(:)$') # get end of line : num = re.compile(r'(\d+\.)') # get a number or numbers followed by . numsub = re.compile(r'(\w\.)') # get number subpoints \w followed by . subsub = re.compile(r'(- )') # get the number subpoint marker - if __name__ == "__main__": # match only looks at the beginning of each line # search looks through the whole line # re.compile(r'(:)$') finds : at the end of a line, if : is there for i in range(len(text_list)): res = p.match(text_list[i]) end = endline.search(text_list[i]) number = num.match(text_list[i]) numsubp = numsub.match(text_list[i]) subp = sub.match(text_list[i]) if res and end: text_list[i] = ' ' + text_list[i] elif res and not end: text_list[i] = ' ' + text_list[i] elif number: text_list[i] = '\t' + text_list[i] elif numsubp: text_list[i] = '\t ' + text_list[i] elif subp: text_list[i] = '\t\t ' + text_list[i] elif i == len(text_list) - 1: text_list[i] = '\n\t' + text_list[i] savepath = '/home/pedro/myPython/re/text_files/formatted_text1.txt' with open(savepath, 'w') as output: output.writelines(text_list)You can tweak and change the elifs to get what you want. Add more re expressions for different text parts! |