Python - Replace contents accros files

張庭瑋 Gary Chang 2025/03/03 15:00:00
11

Intro

 

Sometimes, we encounter inconsistent wording in provided documents. By using a script for batch processing, we could ensure perfect consistency with zero missed updates.

 

 

Work flow

 

We would go though following steps by manual work

 

1.    Find all files

 

2.    Open file

 

3.    Search contents

 

4.    Do the replacement

 

5.    Save to a new file

 

6.    Start over until work done

 

To clearly illustrate the script, we'll start with the txt file since they are easier to read

 

import os

input_folder =  'example_read'
out_folder = os.path.join(input_folder , 'example_out')

os.makedirs(out_folder , exist_ok=True)

for filename in os.listdir(input_folder):
    if filename.endswith('.txt'):
        file_path = os.path.join(input_folder, filename)
        with open(file_path , 'r' , encoding= 'utf-8') as file:
            content = file.read()
        if '一月到四月' in content:
            modified_content = content.replace('一月到四月', '第一季 ')

            new_filename = filename.replace('.txt' , '_corrected.txt')
            new_file_path = os.path.join(out_folder, new_filename)

            with open(new_file_path , 'w' , encoding='utf-8') as file:
                file.write(modified_content)
            
            print(f'{filename} is processed and saved as {new_filename}')
        else:
            print(f'{filename} do not contain 「一月到四月」')
print('Job done')

 

Before

 

 

After

 

 

 

Combination with Pandas

 

import os
import pandas as pd

def replace_texts_in_excel_files(
    input_folder='./example_read', 
    search_texts=["Banana" , "Pinapple"],
    replacement_text='Yellow Fruits',
    output_subfolder='example_out', 
    file_extension='.xlsx'
):
    # 1) Create 
    output_folder = os.path.join(input_folder, output_subfolder)
    os.makedirs(output_folder, exist_ok=True)

    # 2) Look at every file 
    for filename in os.listdir(input_folder):
        if filename.endswith(file_extension):
            file_path = os.path.join(input_folder, filename)
            
            # 3) Load the Excel file 
            df = pd.read_excel(file_path, engine='openpyxl')
            modified = False  # to prevent reprocessing
            
            # 4) Check each column in the file
            for col in df.columns:
                # If the column is made of text data
                if pd.api.types.is_string_dtype(df[col]):
                    # Go through each word we want to replace
                    for search_text in search_texts:
                        # If the column has that word, replace it
                        if df[col].str.contains(search_text, na=False).any():
                            df[col] = df[col].str.replace(search_text, replacement_text, regex=False)
                            modified = True
            
            # 5) If changes were made, save to a new Excel file
            if modified:
                new_filename = filename.replace('.xlsx', '_corrected.xlsx')
                new_file_path = os.path.join(output_folder, new_filename)
                df.to_excel(new_file_path, index=False)
                print(f"{filename} is updated and saved as {new_filename}")
            else:
                print(f"{filename} contains none of the target words. No changes made.")

    print("All files have been processed!")
replace_texts_in_excel_files()

 

Before

 

 

 

We are analyzing the market based on fruit colors. With a single click, we’ve successfully replaced “Apple” with “Red Fruits.” However, replacing “Banana” and “Pinapple” requires 2 separate actions each, totaling 10 manual replacements. This process is repetitive and inefficient—can we streamline it for faster, more accurate results?

 

 

After

 

 

 

張庭瑋 Gary Chang