Import libraries, load the DataFrames from the TSVs made previously, and compute each scene's offset frame so the PySceneDetect scenes align with the max-peak frame data¶
import numpy as np
import pandas as pd
import seaborn as sns
# Load scene boundaries and per-frame max-peak data from the TSVs built earlier.
df_scenes = pd.read_csv("../output/scenes_data.tsv", sep="\t")
df_max_peak = pd.read_csv("../output/max_peak_data.tsv", sep="\t")

# A scene's offset is one frame before the next scene's onset within the same
# episode (subtract 1 so the next scene's first frame is not included).
next_onsets = df_scenes.groupby("episode")["onset_frame"].shift(-1)
df_scenes["offset_frame"] = next_onsets - 1

# The last scene of an episode has no following onset; close it at the
# episode's final frame as observed in the max-peak data.
episode_max_frames = df_max_peak.groupby("episode")["episode_frame"].max()


def _close_last_scene(row):
    # Fall back to the episode's last observed frame when no next onset exists.
    if pd.isna(row["offset_frame"]):
        return episode_max_frames[row["episode"]]
    return row["offset_frame"]


df_scenes["offset_frame"] = df_scenes.apply(_close_last_scene, axis=1)
As a precaution, normalize the episode labels and frame columns before merging:
# Normalize dtypes so downstream comparisons and merges behave consistently:
# episode labels become strings, frame indices become integers.
for frame, int_cols in (
    (df_max_peak, ["episode_frame"]),
    (df_scenes, ["onset_frame", "offset_frame"]),
):
    frame["episode"] = frame["episode"].astype(str)
    for col in int_cols:
        frame[col] = frame[col].astype(int)
Merge both datasets, keeping only the columns we want¶
# merge_asof requires sorted keys: order both tables by episode, then frame/onset.
df_max_peak = df_max_peak.sort_values(by=["episode", "episode_frame"])
df_scenes = df_scenes.sort_values(by=["episode", "onset_frame"])

# Assign each frame to the scene whose onset is the latest one at or before
# that frame, handling one episode at a time so scenes never match across
# episode boundaries.
results = []
scene_cols = ["scene_number", "onset_frame", "offset_frame", "global_scene_number"]
for ep in df_max_peak["episode"].unique():
    ep_peaks = df_max_peak.loc[df_max_peak["episode"] == ep].copy()
    ep_scenes = df_scenes.loc[df_scenes["episode"] == ep, scene_cols].copy()
    # Backward asof-merge: each frame picks the most recent scene onset.
    matched = pd.merge_asof(
        ep_peaks.sort_values("episode_frame"),
        ep_scenes.sort_values("onset_frame"),
        left_on="episode_frame",
        right_on="onset_frame",
        direction="backward",
    )
    # Discard frames that fall past the matched scene's last frame
    # (this also drops frames before the first onset, whose offset is NaN).
    matched = matched.loc[matched["episode_frame"] <= matched["offset_frame"]]
    results.append(matched)

# Stack the per-episode matches back into a single DataFrame.
df_result = pd.concat(results, ignore_index=True)
Testing to see whether the merge worked¶
# Sanity check: every assigned frame must lie inside its matched scene's span.
in_range = (
    (df_result["episode_frame"] >= df_result["onset_frame"])
    & (df_result["episode_frame"] <= df_result["offset_frame"])
)
df_filtered = df_result[in_range]
# The two counts should be equal; any difference means mis-assigned frames remain.
print(f"Number of correctly matched frames: {len(df_filtered)}")
print(f"Number of frames initially: {len(df_result)}")
Number of correctly matched frames: 5957073
Number of frames initially: 5957073
# Summarize how many frames survived scene assignment versus the raw total;
# frames outside any scene span were dropped by the merge filter above.
n_total_frames = len(df_max_peak)
n_assigned_frames = len(df_result)
print(f"Total frames: {n_total_frames}")
print(f"Frames assigned to a scene: {n_assigned_frames}")
print(f"Dropped frames: {n_total_frames - n_assigned_frames}")
Total frames: 7178296
Frames assigned to a scene: 5957073
Dropped frames: 1221223
# Look for gaps between consecutive scenes inside each episode: frames that
# fall after one scene's offset but before the next scene's onset.
df_scenes["next_onset"] = df_scenes.groupby("episode")["onset_frame"].shift(-1)
df_scenes["gap"] = (df_scenes["next_onset"] - 1) - df_scenes["offset_frame"]
# Display only scenes that are followed by an uncovered stretch of frames.
df_scenes[df_scenes["gap"] > 0]
Loading...
# Report episodes whose earliest frame precedes the first scene onset;
# such frames can never be matched by the backward asof-merge.
for ep in df_max_peak["episode"].unique():
    ep_frames = df_max_peak.loc[df_max_peak["episode"] == ep, "episode_frame"]
    ep_onsets = df_scenes.loc[df_scenes["episode"] == ep, "onset_frame"]
    first_frame = ep_frames.min()
    first_onset = ep_onsets.min()
    if first_frame < first_onset:
        print(f"Episode {ep} has frames before first scene: frame {first_frame} < onset {first_onset}")
# Report episodes whose last scene ends before the final frame; trailing
# frames past the last offset are dropped by the merge filter.
last_scene_offsets = df_scenes.groupby("episode")["offset_frame"].max()
episode_max_frames = df_max_peak.groupby("episode")["episode_frame"].max()
for ep in df_max_peak["episode"].unique():
    max_offset = last_scene_offsets.get(ep, None)
    max_frame = episode_max_frames.get(ep, None)
    # Skip episodes missing from either table rather than comparing None.
    if max_offset is None or max_frame is None:
        continue
    if max_offset < max_frame:
        print(f"Episode {ep} last scene offset {max_offset} < max frame {max_frame}")
# Re-run the per-episode merge with diagnostics, after stripping stray
# whitespace from episode labels so the grouping keys compare cleanly.
df_max_peak["episode"] = df_max_peak["episode"].astype(str).str.strip()
df_scenes["episode"] = df_scenes["episode"].astype(str).str.strip()


def _report_ranges(episode, peaks_ep, scenes_ep):
    # Print the frame range next to the scene coverage for one episode so
    # coverage holes are easy to spot in the output.
    min_frame = peaks_ep["episode_frame"].min()
    max_frame = peaks_ep["episode_frame"].max()
    min_onset = scenes_ep["onset_frame"].min()
    max_offset = scenes_ep["offset_frame"].max()
    print(f"Episode: {episode}")
    print(f"Frame range: {min_frame} to {max_frame}")
    print(f"Scene coverage: {min_onset} to {max_offset}")


results = []
scene_columns = ["scene_number", "onset_frame", "offset_frame", "global_scene_number"]
for episode in df_max_peak["episode"].unique():
    peaks_ep = df_max_peak[df_max_peak["episode"] == episode].copy()
    scenes_ep = df_scenes[df_scenes["episode"] == episode][scene_columns].copy()
    _report_ranges(episode, peaks_ep, scenes_ep)
    # Backward asof-merge: each frame takes the most recent scene onset.
    merged = pd.merge_asof(
        peaks_ep.sort_values("episode_frame"),
        scenes_ep.sort_values("onset_frame"),
        left_on="episode_frame",
        right_on="onset_frame",
        direction="backward",
        allow_exact_matches=True,
    )
    # Keep only frames that fall within the matched scene's span.
    merged_filtered = merged[merged["episode_frame"] <= merged["offset_frame"]]
    print(f"Total frames: {len(peaks_ep)}")
    print(f"Frames assigned after merge: {len(merged_filtered)}")
    print(f"Frames dropped: {len(peaks_ep) - len(merged_filtered)}\n")
    results.append(merged_filtered)

df_result = pd.concat(results, ignore_index=True)
Fetching long content....
Save the merged dataset¶
#df_result.to_csv("../output/Peak_scenes_merged.tsv", sep="\t", index=False)