Python - Replace contents accros files
Intro
Sometimes, we encounter inconsistent wording in provided documents. By using a script for batch processing, we could ensure perfect consistency with zero missed updates.
Work flow
We would go though following steps by manual work
1. Find all files
2. Open file
3. Search contents
4. Do the replacement
5. Save to a new file
6. Start over until work done
To clearly illustrate the script, we'll start with the txt file since they are easier to read
import os
input_folder = 'example_read'
out_folder = os.path.join(input_folder , 'example_out')
os.makedirs(out_folder , exist_ok=True)
for filename in os.listdir(input_folder):
if filename.endswith('.txt'):
file_path = os.path.join(input_folder, filename)
with open(file_path , 'r' , encoding= 'utf-8') as file:
content = file.read()
if '一月到四月' in content:
modified_content = content.replace('一月到四月', '第一季 ')
new_filename = filename.replace('.txt' , '_corrected.txt')
new_file_path = os.path.join(out_folder, new_filename)
with open(new_file_path , 'w' , encoding='utf-8') as file:
file.write(modified_content)
print(f'{filename} is processed and saved as {new_filename}')
else:
print(f'{filename} do not contain 「一月到四月」')
print('Job done')
Before
After
Combination with Pandas
import os
import pandas as pd
def replace_texts_in_excel_files(
input_folder='./example_read',
search_texts=["Banana" , "Pinapple"],
replacement_text='Yellow Fruits',
output_subfolder='example_out',
file_extension='.xlsx'
):
# 1) Create
output_folder = os.path.join(input_folder, output_subfolder)
os.makedirs(output_folder, exist_ok=True)
# 2) Look at every file
for filename in os.listdir(input_folder):
if filename.endswith(file_extension):
file_path = os.path.join(input_folder, filename)
# 3) Load the Excel file
df = pd.read_excel(file_path, engine='openpyxl')
modified = False # to prevent reprocessing
# 4) Check each column in the file
for col in df.columns:
# If the column is made of text data
if pd.api.types.is_string_dtype(df[col]):
# Go through each word we want to replace
for search_text in search_texts:
# If the column has that word, replace it
if df[col].str.contains(search_text, na=False).any():
df[col] = df[col].str.replace(search_text, replacement_text, regex=False)
modified = True
# 5) If changes were made, save to a new Excel file
if modified:
new_filename = filename.replace('.xlsx', '_corrected.xlsx')
new_file_path = os.path.join(output_folder, new_filename)
df.to_excel(new_file_path, index=False)
print(f"{filename} is updated and saved as {new_filename}")
else:
print(f"{filename} contains none of the target words. No changes made.")
print("All files have been processed!")
replace_texts_in_excel_files()
Before
We are analyzing the market based on fruit colors. With a single click, we’ve successfully replaced “Apple” with “Red Fruits.” However, replacing “Banana” and “Pinapple” requires 2 separate actions each, totaling 10 manual replacements. This process is repetitive and inefficient—can we streamline it for faster, more accurate results?
After