import re
# Read the scan listing and parse every line into a record dict.
# Each usable line must hold at least four tab-separated fields, in this
# order: file path, file name, file size, MD5 checksum.
records = []
with open(r'C:\scan\all.txt', 'r', encoding='utf8') as f:
    for line_number, line in enumerate(f, start=1):
        stripped = line.strip()
        if not stripped:
            # A blank line (e.g. a trailing newline at end of file) carries
            # no record; indexing its fields would raise IndexError.
            continue
        # A run of consecutive tabs is treated as a single separator.
        fields = re.split(r'\t+', stripped)
        if len(fields) < 4:
            # Report and skip a truncated line instead of aborting the
            # whole run on one bad entry.
            print(f"Skipping malformed line {line_number}: {stripped!r}")
            continue
        records.append({
            'filepath': fields[0],
            'filename': fields[1],
            'filesize': fields[2],
            'md5sum': fields[3],
        })

# Show the first parsed record as a quick sanity check. The guard avoids an
# IndexError when the listing is empty or contained no usable lines.
if records:
    print(records[0])

#
# Find duplicate file records.
# Two records are duplicates when their filename, filesize and md5sum all
# match. Records are bucketed by that triple in a single pass, so every copy
# of a file lands in its group (not only the first matching pair), and the
# work grows linearly with the number of records instead of comparing every
# pair of records.
groups_by_key = {}
for record in records:
    key = (record['filename'], record['filesize'], record['md5sum'])
    groups_by_key.setdefault(key, []).append(record)

# Keep only the keys shared by more than one record. Dicts preserve insertion
# order, so groups appear in the order their first member was read.
duplicate_records = [
    group for group in groups_by_key.values() if len(group) > 1
]

# Print every group of duplicate file records, one blank line between groups.
# The outer loop variable is named `group` so that the module-level `records`
# list is not rebound to the last duplicate group once the loop finishes.
for group in duplicate_records:
    print('Found duplicate files:')
    for record in group:
        print(f"Filename: {record['filename']}")
        print(f"Filesize: {record['filesize']}")
        print(f"MD5sum: {record['md5sum']}")
        print(f"Filepath: {record['filepath']}")
    print('')
#

标签: none

添加新评论