multiprocessing phash from every frame in folder

flash77 · Nov-16-2022, 02:09 PM

Dear community,

because I'm new to multiprocessing I need some help...

I've got a function "hanger_detection.create_phash()" which should create a hashstring from a frame.
From each frame in the folder "D:/S8_hanger_finder/neuer_Ansatz/aktueller_Versuch/phash_test/" should be created a hashstring using multiprocessing.

(Nov-14-2022, 08:16 PM)deanhystad Wrote: with Pool(processes=processes) as pool: return pool.map(process, range(1, 11))

The above repeats process 10 times.

But unfortunately I'm needing to treat every frame in the folder "D:/S8_hanger_finder/neuer_Ansatz/aktueller_Versuch/phash_test/".
I'm trying to iterate over obj to fetch every frame:

    with Pool(processes=4) as pool:
        for entry in obj:
            frame_hash_list = pool.map(hanger_detection.create_phash, entry)

As you see, I'm not sure what I'm doing...
Searching the web is actually not suitable for me, because I'm guessing...

Could you please be so kind and help me out?

Here is my code:
main.py:

import os
import hanger_detection
from multiprocessing import Pool
import time


if __name__ == "__main__":
    measure_time_start = time.time()
    frame_hash_list = []
    hangers = []
    p = "D:/S8_hanger_finder/neuer_Ansatz/aktueller_Versuch/phash_test/"
    obj = os.scandir(p)
    pool = Pool()
    with Pool(processes=4) as pool:
        for entry in obj:
            frame_hash_list = pool.map(hanger_detection.create_phash, entry)
    hangers = hanger_detection.detect_hangers(frame_hash_list)
    number_of_hangers = len(hangers)
    hanger_detection.fill_hanger_information_in_excel(hangers)
    measure_time_end = time.time()
    obj.close()
    print("frame_hash_list: " + str(frame_hash_list))
    print("hangers: " + str(hangers))
    print("number_of_hangers: " + str(number_of_hangers))
    print("Time taken: " + str(measure_time_start - measure_time_end))

hanger_detection.py:

from PIL import Image
import imagehash
import openpyxl
from itertools import zip_longest

p = "D:/S8_hanger_finder/neuer_Ansatz/aktueller_Versuch/phash_test/"


def difference_count(a: str, b: str) -> int:
    """Count differences between a and b"""
    return sum(1 for a, b in zip_longest(a, b) if a != b)


def create_phash(entry):
    # load frames
    frame = Image.open(p + str(entry.name))
    # create pHash
    # Compare hashes to determine whether the frames are the same or not
    frame_phash = str(imagehash.phash(frame))
    return frame_phash


def detect_hangers(frame_hash_list, threshold: int = 0, min_count: int = 4):
    """Return list of "hangers" detected in frame_hash_list.
    A "hanger" is consecutive frames that are the same.

    frame_hash_list : list of frame hash strings.  Frames are considered
    same or different by counting the differences in their hash strings.

    threshold : Maximum number of diffences allowed for two frames to be
    considered "same".

    min_count : Minimum length of a hanger.  Short hangers aren't noticable
    and don't have to be removed.
    """
    hangers = []  # List of hanger start, stop frame indexes
    start_index = 0
    start_frame = frame_hash_list[0]
    for index, frame in enumerate(frame_hash_list[1:], start=1):
        # Are frame and start_frame disimilar enough?
        if difference_count(start_frame, frame) > threshold:
            if index - start_index >= min_count:
                # Add hanger to list
                hangers.append((start_index, index - 1))
            start_frame = frame
            start_index = index
    # Check if we end with a hanger
    if index - start_index > 10:
        hangers.append([start_index, index])
    return hangers


def convert_frame_nr_in_time(d):
    # S8-Movie (avi-file) is checked of hangers
    #####################################################
    # 1 hour contains 72000 frames
    c1 = 72000
    # 1 minute contains 1200 frames
    c2 = 1200
    # 1 second contains 20 frames
    c3 = 20

    def find_even_frame_nr(a, b, c):
        while True:
            if a % c == 0:
                break
            else:
                a -= 1
                b += 1
        return a, b

    frame_nr_full_hour, rest_1 = find_even_frame_nr(d, 0, c1)
    number_of_hours = frame_nr_full_hour / c1
    ###########################################################
    frame_nr_full_minute, rest_2 = find_even_frame_nr(rest_1, 0, c2)
    number_of_minutes = frame_nr_full_minute / c2
    ###########################################################
    frame_nr_full_second, rest_3 = find_even_frame_nr(rest_2, 0, c3)
    number_of_seconds = frame_nr_full_second / c3

    return number_of_hours, number_of_minutes, number_of_seconds


def fill_hanger_information_in_excel(hangers):
    p = "D:/S8_hanger_finder/neuer_Ansatz/aktueller_Versuch/S8-Hanger_Positionen.xlsx"
    fileXLSX = openpyxl.load_workbook(p)
    sheet = fileXLSX["Blatt"]
    # clear old hanger information
    # film doesn't have more than 100 hangers
    r = 5
    c = 2
    for z in range(r, r + 100):
        for s in range(c, c + 2):
            sheet.cell(row=z, column=s).value = None

    # fill in hanger information
    r = 5
    for i in hangers:
        frame_nr_hanger_start = i[0]
        frame_nr_hanger_end = i[1]
        number_of_hours_start, number_of_minutes_start, number_of_seconds_start = convert_frame_nr_in_time(
            frame_nr_hanger_start)
        number_of_hours_end, number_of_minutes_end, number_of_seconds_end = convert_frame_nr_in_time(
            frame_nr_hanger_end)
        number_of_hours_start_int = int(number_of_hours_start)
        number_of_minutes_start_int = int(number_of_minutes_start)
        number_of_seconds_start_int = int(number_of_seconds_start)
        number_of_hours_end_int = int(number_of_hours_end)
        number_of_minutes_end_int = int(number_of_minutes_end)
        number_of_seconds_end_int = int(number_of_seconds_end)
        number_of_hours_start_str = str(number_of_hours_start_int)
        if len(number_of_hours_start_str) == 1:
            number_of_hours_start_str = "0" + number_of_hours_start_str
        number_of_minutes_start_str = str(number_of_minutes_start_int)
        if len(number_of_minutes_start_str) == 1:
            number_of_minutes_start_str = "0" + number_of_minutes_start_str
        number_of_seconds_start_str = str(number_of_seconds_start_int)
        if len(number_of_seconds_start_str) == 1:
            number_of_seconds_start_str = "0" + number_of_seconds_start_str
        number_of_hours_end_str = str(number_of_hours_end_int)
        if len(number_of_hours_end_str) == 1:
            number_of_hours_end_str = "0" + number_of_hours_end_str
        number_of_minutes_end_str = str(number_of_minutes_end_int)
        if len(number_of_minutes_end_str) == 1:
            number_of_minutes_end_str = "0" + number_of_minutes_end_str
        number_of_seconds_end_str = str(number_of_seconds_end_int)
        if len(number_of_seconds_end_str) == 1:
            number_of_seconds_end_str = "0" + number_of_seconds_end_str

        # create timestamp
        timestamp_start_str = number_of_hours_start_str + ":" + number_of_minutes_start_str + ":" + number_of_seconds_start_str
        timestamp_end_str = number_of_hours_end_str + ":" + number_of_minutes_end_str + ":" + number_of_seconds_end_str
        sheet.cell(row=r, column=2).value = timestamp_start_str
        sheet.cell(row=r, column=4).value = timestamp_end_str
        r += 1
    fileXLSX.save(p)

**deanhystad** · (This post was last modified: Nov-17-2022, 02:18 PM by deanhystad.)

Here's a shot. I made an iterator that hashes the frames using a processor Pool. I used Pool.imap() instead of Pool.map() so you don't have to wait for all the frames to be hashed before you start looking for hangers.

from multiprocessing import Pool
from random import randint, random
from time import sleep

def phash_func(x):
    """Pretend frame hash frunction"""
    sleep(random())
    return randint(0, 10), x

def phash_iter(count=1):
    """frame phash iterator.
    Use multi-processing to hash multiple frames in parallel
    """
    pool = Pool(processes=4)
    for result in pool.imap(phash_func, range(count)):
        yield result

def detect_hangers(hash_iter, threshold: int = 3, min_count: int = 4):
    hangers = []
    start_index = 0
    start_frame, start_index = next(hash_iter)
    for frame, index in hash_iter:
        print(index)
        # Are frame and start_frame disimilar enough?
        if abs(start_frame - frame) > threshold:
            if index - start_index >= min_count:
                # Add hanger to list
                hangers.append((start_index, index - 1))
            start_frame = frame
            start_index = index
    # Check if we end with a hanger
    if index - start_index > 10:
        hangers.append([start_index, index])
    return hangers

if __name__ == "__main__":
    print(detect_hangers(phash_iter(100)))

You'd need to replace phash_func() with the real routine to do frame hashing. You also need to modify phash_iter so the iterator provided to the pool iterates through the frame files.

To get the frame hash values in the right order you'll need to pass the frames into the hash_iter in the correct order (order will be maintained). If you are iterating through a folder of frame files, you'll need to sort those files (or the file names) before passing them to phash_func().

flash77 · Nov-20-2022, 02:33 PM

Dear community,

especially deanhystad!

thanks a lot for the great support!!

Because I'm professionally restrained I will get the possibility to work on this project not until two weeks...

I will notice you when I have resumed to this project...

Thanks again...

flash77

flash77 · Apr-19-2023, 12:28 PM

Dear community,

now I have the opportunity to continue working on the project. I read some stuff over multiprocessing and imap (www.superfastpython.com) but I don't get it working.

First, I think, it would be better to concentrate on the iterator (because I hide the hangerdetection).

I replaced the "phash_func()" with the real routine "create_phash()".

The function "create_phash" does create a phash from a frame.

I have difficulties to get the iterator iterate over the frames in the folder.

Could you please tell me what I'm doing wrong?

Thanks a lot for your patience!

import os
from PIL import Image
import imagehash
import time
#import openpyxl
from multiprocessing import Pool

p = "D:/S8_hanger_finder/neuer_Ansatz/phash_test/"


def create_phash(frame):
    # load frames
    frame = Image.open(p + str(frame))
    # create pHash
    # Compare hashes to determine whether the frames are the same or not
    frame_phash = str(imagehash.phash(frame))
    return frame_phash


def phash_iter(d):
    # frame phash iterator
    # use multiprocessing to hash frames in parallel
    pool = Pool(processes=6)
    for result in pool.imap(create_phash, d, chunksize=1):
        yield result

if __name__ == "__main__":
    begin = time.time()
    print(phash_iter(os.listdir(p)))
    end = time.time()
    print(end - begin)

**deanhystad** · (This post was last modified: Apr-19-2023, 05:23 PM by deanhystad.)

Demonstrates how to use iterator/generator. Currently just prints the names of files in the current working directory, but if you change "p" and "create_phash()" to your code it will print strings that are the image hash for the files in your image folder. Those might be really long strings.

import os
from multiprocessing import Pool

p = os.getcwd()


def create_phash(file):
    return file


def phash_iter(files, processes=6):
    # frame phash iterator
    # use multiprocessing to hash frames in parallel
    pool = Pool(processes=processes)
    for result in pool.imap(create_phash, files, chunksize=1):
        yield result

if __name__ == "__main__":
    for phash in phash_iter(os.listdir(p)):
        print(phash)

flash77 · (This post was last modified: Apr-21-2023, 07:16 AM by flash77.)

Hi,
thank you for the example to get all the hash strings.

It is still necessary to to analyze the hash strings in detect_hangers() to find the hangers.
I tried to modify the function detect_hangers().

start_phash, start_index = next(phash_iter)

But in line 42 the interpreter says, that he expected 2 values (start_phash, start_index). But he gets too many values...
I'm thinking that "next" steps through all hash strings one by one...
Like:

For phash in phash_iter(all_frames)

Could you please tell me why next is not working properly?

Geetings...

import os
from PIL import Image
import imagehash
import time
import openpyxl
from multiprocessing import Pool
p = "D:/S8_hanger_finder/neuer_Ansatz/phash_test/"


def create_phash(frame):
    # load frames
    frame = Image.open(p + str(frame))
    # create pHash
    # Compare hashes to determine whether the frames are the same or not
    phash = str(imagehash.phash(frame))
    return phash


def phash_iter(d):
    # frame phash iterator
    # use multiprocessing to hash frames in parallel
    pool = Pool(processes=6)
    for result in pool.imap(create_phash, d, chunksize=1):
        yield result


def detect_hangers(phash_iter, threshold: int = 3, min_count: int = 4):
    """Return list of "hangers" detected in frame_hash_list.
    A "hanger" is consecutive frames that are the same.

    frame_hash_list : list of frame hash strings.  Frames are considered
    same or different by counting the differences in their hash strings.

    threshold : Maximum number of diffences allowed for two frames to be
    considered "same".

    min_count : Minimum length of a hanger.  Short hangers aren't noticable
    and don't have to be removed.
    """
    hangers = []  # List of hanger start, stop frame indexes
    start_index = 0
    start_phash, start_index = next(phash_iter)
    for phash, index in phash_iter:
        print(index)
        # Are frame and start_frame disimilar enough?
        if abs(start_phash - phash) > threshold:
            if index - start_index >= min_count:
                # Add hanger to list
                hangers.append((start_index, index - 1))
            start_phash = phash
            start_index = index
    # Check if we end with a hanger
    if index - start_index > 10:
        hangers.append([start_index, index])
    return hangers


if __name__ == "__main__":
    begin = time.time()
    all_frames = os.listdir(p)

    hangers = detect_hangers(phash_iter(all_frames))

    print(hangers)
    end = time.time()
    print(end - begin)

I'm very thankful for your excellent help!

**deanhystad** · (This post was last modified: Apr-21-2023, 07:50 PM by deanhystad.)

This expects a phash and an index returned.

start_phash, start_index = next(phash_iter)

Your iterator does not return an index.

yield result

Nore does the phash function.

return frame_phash

In my earlier example on which you appear to be basing your code, the phash function returned a phash and an index. This was easy for my example. I needed an iterator for pool.imap, and range() worked great for this.

def phash_func(x):
    """Pretend frame hash frunction"""
    sleep(random())
    return randint(0, 10), x

Your phash function does not return an index. That is fine, you can make an index somewhere else. For example, you could make the index in the iterator/generator.

def phash_iter(d):
    # frame phash iterator
    # use multiprocessing to hash frames in parallel
    pool = Pool(processes=6)
    for index, result in enumerate(pool.imap(create_phash, d, chunksize=1)):
        yield result, index

Or you could make the index where you use the generator.

    for index, phash in enumerate(phash_iter):
        # Are frame and start_frame disimilar enough?

flash77 · Apr-24-2023, 08:35 AM

Dear deanhystad, I think I got it now.
I modified "phash_iter" and "detect_hangers". Also, I used "difference_count" from an earlier example to calculate the differences of the "hash strings". Now I can experiment with "threshold" and "min_count". I can also adjust the "chunksize" to the respective number of images.
Thank you so much for the great support!! Smile

import os
from PIL import Image
import imagehash
import time
import openpyxl
from multiprocessing import Pool
from itertools import zip_longest
p = "D:/S8_hanger_finder/neuer_Ansatz/phash_test/"


def create_phash(frame):
    # load frames
    frame = Image.open(p + str(frame))
    # create pHash
    # Compare hashes to determine whether the frames are the same or not
    phash = str(imagehash.phash(frame))
    return phash


def difference_count(a: str, b: str) -> int:
    # count differences between a and b
    return sum(1 for a, b in zip_longest(a, b) if a != b)


def phash_iter(d):
    # frame phash iterator
    # use multiprocessing to hash frames in parallel
    pool = Pool(processes=6)
    for index, phash in enumerate(pool.imap(create_phash, d, chunksize=1)):
        yield index, phash


def detect_hangers(phash_iter, threshold: int = 2, min_count: int = 4):
    """Return list of "hangers" detected in frame_hash_list.
    A "hanger" is consecutive frames that are the same.

    frame_hash_list : list of frame hash strings.  Frames are considered
    same or different by counting the differences in their hash strings.

    threshold : Maximum number of diffences allowed for two frames to be
    considered "same".

    min_count : Minimum length of a hanger.  Short hangers aren't noticable
    and don't have to be removed.
    """
    hangers = []  # List of hanger start, stop frame indexes
    start_index, start_phash = next(phash_iter)
    for index, phash in phash_iter:
        print(index, phash)
        # Are frame and start_frame disimilar enough?
        if difference_count(start_phash, phash) > threshold:
            if index - start_index >= min_count:
                # Add hanger to list
                hangers.append((start_index, index - 1))
            start_phash = phash
            start_index = index
    # Check if we end with a hanger
    if index - start_index > 10:
        hangers.append([start_index, index])
    return hangers


if __name__ == "__main__":
    begin = time.time()
    all_frames = os.listdir(p)
    hangers = detect_hangers(phash_iter(all_frames))
    end = time.time()

    print('hangers:')
    print(hangers)
    numberOfHangers = len(hangers)
    print('number of hangers: ' + str(numberOfHangers))
    print('time: ' + str(end - begin))

Possibly Related Threads…
Thread		Author	Replies	Views	Last Post
	Compare folder A and subfolder B and display files that are in folder A but not in su	Melcu54	3	1,549	Jan-05-2024, 05:16 PM Last Post: Pedroski55
	Compare filename with folder name and copy matching files into a particular folder	shantanu97	2	6,552	Dec-18-2021, 09:32 PM Last Post: Larz60+
	Move file from one folder to another folder with timestamp added end of file	shantanu97	0	3,188	Mar-22-2021, 10:59 AM Last Post: shantanu97
	Python Cut/Copy paste file from folder to another folder	rdDrp	4	6,945	Aug-19-2020, 12:40 PM Last Post: rdDrp
	Delete directories in folder is not working after folder is updated	asheru93	2	3,442	Feb-13-2019, 12:37 PM Last Post: asheru93
	copy content of folder to existing folder	shlomi27	0	3,072	Aug-11-2018, 01:44 PM Last Post: shlomi27

multiprocessing phash from every frame in folder

User Panel Messages

Announcements