import re
# Read the txt file and parse each line into a record
records = []
with open(r'C:\scan\all.txt', 'r', encoding='utf8') as f:
    for line in f:
        # Split on one or more tabs; some lines contain consecutive tabs
        fields = re.split(r'\t+', line.strip())
        if len(fields) < 4:
            # Skip malformed lines that do not contain all four fields
            continue
        records.append({
            'filepath': fields[0],
            'filename': fields[1],
            'filesize': fields[2],
            'md5sum': fields[3]
        })
print(records[0])  # Sanity check: show the first parsed record
#
# Find duplicate file records (same filename, filesize and MD5)
duplicate_records = []
seen_md5sums = set()  # MD5 values that have already been reported
for i, record1 in enumerate(records):
    for j in range(i + 1, len(records)):
        record2 = records[j]
        if (record1['filename'] == record2['filename'] and
                record1['filesize'] == record2['filesize'] and
                record1['md5sum'] == record2['md5sum']):
            # Found a duplicate pair; report each MD5 only once
            if record1['md5sum'] not in seen_md5sums:
                duplicate_records.append([record1, record2])
                seen_md5sums.add(record1['md5sum'])
# Print the duplicate file records (use a new name so the records list is not shadowed)
for pair in duplicate_records:
    print('Found duplicate files:')
    for record in pair:
        print(f"Filename: {record['filename']}")
        print(f"Filesize: {record['filesize']}")
        print(f"MD5sum: {record['md5sum']}")
        print(f"Filepath: {record['filepath']}")
    print('')
#
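# Optional: the nested loops above are O(n^2). Below is a minimal sketch of an
# O(n) alternative, assuming the same record dicts parsed above: group records
# by (filename, filesize, md5sum) in a single pass and treat any group with
# more than one entry as a set of duplicates. This is an illustrative variant,
# not a drop-in replacement for the pair-based output above.
from collections import defaultdict

groups = defaultdict(list)
for record in records:
    key = (record['filename'], record['filesize'], record['md5sum'])
    groups[key].append(record)

for (filename, filesize, md5sum), group in groups.items():
    if len(group) > 1:
        print('Found duplicate files:')
        print(f"Filename: {filename}, Filesize: {filesize}, MD5sum: {md5sum}")
        for record in group:
            print(f"Filepath: {record['filepath']}")
        print('')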