Така...
Понеже се оказа, че скриптът не работи както трябва - някакви безсмислици в output-а - трябваше да го доизчистя.
Резултатите от fdupes:
real 0m9.005s
user 0m1.040s
sys 0m3.668s
Резултатите от скрипта:
real 0m3.786s
user 0m3.852s
sys 0m1.119s
Паралелната обработка на файловете си казва думата.
Пуснах го да рови в локалната директория с музика на единия лаптоп дето го пиша на него.
Не съм правил промените свързани с предложението за проверка на crc32.
Тепърва ще пробвам и това, а след това ще пробвам и на едно ядро, но с async щуротийките. Няма да се учудя, ако този вариант се окаже по-бърз.
Скрипта в момента е в този си вид:
#!/usr/bin/env python3
# Find and prints duplicated files based on their md5 sum
#
import argparse
from collections import defaultdict
from concurrent import futures
from hashlib import md5
import json
import mmap
import os.path
import os
import subprocess
import sys
# Command-line interface: an optional directory to scan and a JSON dump target.
arg_parser = argparse.ArgumentParser(description='Print duplicated files')
arg_parser.add_argument('path', type=str, nargs='?', default='.',
                        help='Path to a directory to scan')
arg_parser.add_argument('-d', '--dump', dest='res_file', type=str,
                        default='./duplicated.json',
                        help='''Path/filename to store the results
Default: ./duplicated.json''')
cli_args = arg_parser.parse_args()
path = cli_args.path
result_file = cli_args.res_file
def human_fsize(size):
    """Return a file size in human readable format, e.g. 1536 -> '1.5 KB'.

    Argument: file's size in bytes
    Type: int
    Returns: str -- the value scaled to the largest fitting unit.
    """
    prefixes = ('B', 'KB', 'MB', 'GB', 'TB')
    # Find the largest unit that keeps the scaled value >= 1.
    # The cap at the last table entry avoids the IndexError the old
    # version hit for sizes >= 1024**5; the >= comparison fixes the
    # old 'size > 1024' off-by-one that printed '1024 KB' for exactly 1024.
    exp = 0
    while size >= 1024 ** (exp + 1) and exp < len(prefixes) - 1:
        exp += 1
    if exp == 0:
        # Plain byte counts are shown unscaled, as before.
        return f'{size} {prefixes[0]}'
    # round(..., 3) preserves the original display precision.
    return f'{round(size / 1024 ** exp, 3)} {prefixes[exp]}'
def file_hash(file_name):
    """Return a tuple of (md5 hexdigest, file name), or None for an empty file.

    Argument: file's name
    Type: str
    """
    # mmap cannot map a zero-length file, and empty files are not
    # interesting as duplicates anyway.
    if os.stat(file_name).st_size == 0:
        return None
    hasher = md5()
    with open(file_name, 'rb') as in_file:
        with mmap.mmap(in_file.fileno(), 0, access=mmap.ACCESS_READ) as mapped:
            # mmap objects support the buffer protocol, so the hash is
            # computed straight from the mapping -- no intermediate copy
            # of the whole file in memory (the old mapped.read() loop
            # materialized the entire file as one bytes object).
            hasher.update(mapped)
    return hasher.hexdigest(), file_name
def tty_clear():
    """Clear the terminal screen without raising if no clear command exists.

    Tries the POSIX 'clear' first (this script already relies on the
    POSIX-only 'setterm'); falls back to 'cls'. The old order paid a
    FileNotFoundError on every call on Linux and crashed outright when
    'clear' was missing too.
    """
    for cmd in (['clear'], ['cls']):
        try:
            subprocess.call(cmd)
            return
        except FileNotFoundError:
            continue
try:
    lblue, purple, default = ('\033[94m', '\033[35m', '\033[0m')
    # Hide the terminal cursor while the single-line progress is redrawn.
    subprocess.call(['setterm', '-cursor', 'off'])
    print('Matching file sizes!\n')
    # Pass 1: walk the tree and bucket files by size. Only files that
    # share their size with at least one other file can be duplicates.
    cl = "\x1b[K"  # ANSI: clear from cursor to end of line
    counter = 0    # files seen so far (progress display)
    eq_sized = defaultdict(list)
    total_files = 0
    for root, _dirs, files in os.walk(path):
        total_files += len(files)
        for name in files:
            counter += 1
            full_name = os.path.join(root, name)
            size = os.stat(full_name).st_size
            eq_sized[size].append(full_name)
            # Keep the progress on one line; shorten very long names.
            if len(name) > 60:
                print(f'\r{cl} {counter:<12}{name[:25]} ..... {name[-25:]}\r', end='', flush=True)
            else:
                print(f'\r{cl} {counter:<12}{name}\r', end='', flush=True)
    # Only buckets holding two or more files need hashing.
    for_hashing = {size: names for size, names in eq_sized.items()
                   if len(names) > 1}
    # Total number of files to hash, used for the percentage display.
    counter = sum(len(names) for names in for_hashing.values())
    tty_clear()
    print('Calculating md5 sums.\n')
    # Pass 2: hash the candidates in parallel; equal digests = duplicates.
    duped = {}
    hashed = defaultdict(list)
    count = 0
    with futures.ProcessPoolExecutor(max_workers=8) as executor:
        for size, files in for_hashing.items():
            for result in executor.map(file_hash, files):
                if result:
                    md5sum, file_name = result
                    hashed[md5sum].append(file_name)
                    count += 1
                    print(f"\r{cl} Done {lblue}{round(count / counter * 100, 3)}%{default} --> {count} from {counter}", end='', flush=True)
                    if len(hashed[md5sum]) > 1:
                        # Two or more files share this digest -> duplicates.
                        duped[md5sum] = hashed[md5sum]
    print()
    # Print the duplicate groups.
    sizes = 0       # bytes reclaimable by deleting all but one copy per group
    duplicates = 0  # number of redundant copies
    for key, values in duped.items():
        size = os.stat(values[0]).st_size
        sizes += (len(values) - 1) * size
        duplicates += len(values) - 1
        print(f'md5: {lblue}{key}{default} size: {lblue}{len(values)} {default}* {lblue}{human_fsize(size)}')
        for v in values:
            print(f' * {purple}{v}{lblue}')
        print(f'{default}')
    # Dump the results in a JSON file.
    with open(result_file, 'w', encoding='utf-8') as dump_file:
        json.dump(duped, dump_file, indent=4, ensure_ascii=False)
    print(f'Dumped as JSON in: {lblue}{result_file}{default}\n')
    print('Summarize:\n')
    print(f' Found {lblue}{total_files}{default} files in {purple}{path}{default}.')
    print(f' Found {lblue}{len(duped)}{default} files with {lblue}{duplicates} duplicates.\n')
    print(f'{default}Deleting the duplicates will free {human_fsize(sizes)}.')
except KeyboardInterrupt:
    print('\n Stopped!\n Keyboard interrupt.')
finally:
    # Restore the cursor on every exit path -- the old code only did it
    # on success and on Ctrl-C, leaving the cursor off after any other error.
    subprocess.call(['setterm', '-cursor', 'on'])
Между другото, не зная защо съм вмъкнал и windows-ката команда за чистене на екрана - 'cls' - след като малко по-надолу ползвам 'setterm', която съм сигурен, че я няма в тази система.