% Conference paper "IterClean" from the TURC 2024 proceedings
% (ACM International Conference Proceeding Series).
% Keep the citation key stable: documents citing this entry refer to it by this key.
% address is the publisher's location; the conference name and dates are recorded in note.
% day, language, keywords and abstract are not standard classic-BibTeX fields;
% styles that do not know them ignore them.
@inproceedings{558bbff4f3cd42cfa7d66d678d518512,
  author    = {Ni, Wei and Zhang, Kaihang and Miao, Xiaoye and Zhao, Xiangyu and Wu, Yangyang and Yin, Jianwei},
  title     = {{IterClean}: An Iterative Data Cleaning Framework with Large Language Models},
  booktitle = {Proceedings of ACM Turing Award Celebration Conference - CHINA 2024, TURC 2024},
  series    = {ACM International Conference Proceeding Series},
  publisher = {Association for Computing Machinery},
  address   = {New York, NY, USA},
  pages     = {100--105},
  year      = {2024},
  month     = jul,
  day       = {5},
  doi       = {10.1145/3674399.3674436},
  language  = {English},
  keywords  = {Data cleaning, error detection, error repair, large language models},
  abstract  = {In the era of generative artificial intelligence, the accuracy of data is paramount. Erroneous data often leads to faulty outcomes and economic detriments. Previous cleaning methods employ a sequential detect-repair paradigm, leaving over half of the errors unsolved in real scenarios. We introduce IterClean, an iterative data cleaning framework leveraging large language models (LLMs). Utilizing an iterative mechanism, the framework employs a two-step process: data labeling and iterative data cleaning. With few labeled data, IterClean leverages an iterative cleaning process involving an error detector, an error verifier, and an error repairer to significantly enhance the cleaning performance. Extensive experiments across four datasets demonstrate that, IterClean achieves an F1 score that is up to three times higher than the best state-of-the-art approaches requiring only 5 labeled tuples.},
  note      = {Publisher Copyright: {\textcopyright} 2024 Owner/Author.; 2024 ACM Turing Award Celebration Conference China, TURC 2024 ; Conference date: 05-07-2024 Through 07-07-2024},
}