Skip to content
New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Add ten/five crop augmentation #110

Open
wants to merge 1 commit into
base: master
Choose a base branch
from
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
4 changes: 4 additions & 0 deletions configs/i3d.yml
Original file line number Diff line number Diff line change
Expand Up @@ -6,9 +6,13 @@ streams: null # Streams to use for feature extraction (e.g. 'rgb' or 'flow'). Bo
flow_type: 'pwc' # Flow to use in I3D. 'pwc' (PWCNet) is faster while 'raft' (RAFT) is more accurate.
extraction_fps: null # For original video fps, leave as "null" (None)

# Feature Augumentation
augment: null # Augmentation to use for feature extraction. Can be ['ten_crop', 'five_crop']

# Extraction Parameters
device: 'cuda:0' # device as in `torch`, can be 'cpu'
on_extraction: 'print' # what to do once the features are extracted. Can be ['print', 'save_numpy', 'save_pickle']
save_option: null # choose what to save. If you only want rgb features, choose 'rgb_only' else null
output_path: './output' # where to store results if saved
tmp_path: './tmp' # folder to store the temporary files used for extraction (frames or aud files)
keep_tmp_files: false # to keep temp files after feature extraction.
Expand Down
1 change: 1 addition & 0 deletions docs/models/i3d.md
Original file line number Diff line number Diff line change
Expand Up @@ -44,6 +44,7 @@ You may test it yourself by providing `--show_pred` flag.
| `video_paths` | `null` | A list of videos for feature extraction. E.g. `"[./sample/v_ZNVhz7ctTq0.mp4, ./sample/v_GGSY1Qvo990.mp4]"` or just one path `"./sample/v_GGSY1Qvo990.mp4"`. |
| `file_with_video_paths` | `null` | A path to a text file with video paths (one path per line). Hint: given a folder `./dataset` with `.mp4` files one could use: `find ./dataset -name "*mp4" > ./video_paths.txt`. |
| `on_extraction` | `print` | If `print`, the features are printed to the terminal. If `save_numpy` or `save_pickle`, the features are saved to either `.npy` file or `.pkl`. |
| `save_option` | `null` | If `rgb_only`, only the rgb features will be saved. |
| `output_path` | `"./output"` | A path to a folder for storing the extracted features (if `on_extraction` is either `save_numpy` or `save_pickle`). |
| `keep_tmp_files` | `false` | If `true`, the reencoded videos will be kept in `tmp_path`. |
| `tmp_path` | `"./tmp"` | A path to a folder for storing temporal files (e.g. reencoded videos). |
Expand Down
22 changes: 18 additions & 4 deletions models/_base/base_extractor.py
Original file line number Diff line number Diff line change
Expand Up @@ -18,9 +18,11 @@ def __init__(self,
output_path: str,
keep_tmp_files: bool,
device: str,
save_option=None,
) -> None:
self.feature_type = feature_type
self.on_extraction = on_extraction
self.save_option = save_option
self.tmp_path = tmp_path
self.output_path = output_path
self.keep_tmp_files = keep_tmp_files
Expand Down Expand Up @@ -76,6 +78,11 @@ def action_on_extraction(
return

for key, value in feats_dict.items():
if self.save_option == 'rgb_only':
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

what's wrong with the streams argument in i3d?

if key != 'rgb':
continue
else:
key = None
if self.on_extraction == 'print':
print(key)
print(value)
Expand All @@ -84,11 +91,18 @@ def action_on_extraction(
elif self.on_extraction in ['save_numpy', 'save_pickle']:
# make dir if doesn't exist
os.makedirs(self.output_path, exist_ok=True)
fpath = make_path(self.output_path, video_path, key, action2ext[self.on_extraction])
if key != 'fps' and len(value) == 0:
print(f'Warning: the value is empty for {key} @ {fpath}')
# save the info behind the each key
action2savefn[self.on_extraction](fpath, value)
if len(value.shape) < 3:
fpath = make_path(self.output_path, video_path, key, action2ext[self.on_extraction])
if key != 'fps' and len(value) == 0:
print(f'Warning: the value is empty for {key} @ {fpath}')
action2savefn[self.on_extraction](fpath, value)
else:
for i in range(value.shape[0]):
fpath = make_path(self.output_path, video_path, key, action2ext[self.on_extraction], i)
if key != 'fps' and len(value) == 0:
print(f'Warning: the value is empty for {key} @ {fpath}')
action2savefn[self.on_extraction](fpath, value[i, :])
else:
raise NotImplementedError(f'on_extraction: {self.on_extraction} is not implemented')

Expand Down
33 changes: 27 additions & 6 deletions models/i3d/extract_i3d.py
Original file line number Diff line number Diff line change
Expand Up @@ -24,6 +24,7 @@ def __init__(self, args) -> None:
super().__init__(
feature_type=args.feature_type,
on_extraction=args.on_extraction,
save_option=args.save_option,
tmp_path=args.tmp_path,
output_path=args.output_path,
keep_tmp_files=args.keep_tmp_files,
Expand All @@ -38,15 +39,24 @@ def __init__(self, args) -> None:
self.extraction_fps = args.extraction_fps
self.step_size = 64 if args.step_size is None else args.step_size
self.stack_size = 64 if args.stack_size is None else args.stack_size
self.aug_type = args.augment
self.resize_transforms = torchvision.transforms.Compose([
torchvision.transforms.ToPILImage(),
ResizeImproved(self.min_side_size),
PILToTensor(),
ToFloat(),
])
if self.aug_type is None:
aug_transform = TensorCenterCrop(self.central_crop_size)
elif self.aug_type == 'five_crop':
aug_transform = torchvision.transforms.FiveCrop(self.central_crop_size)
self.num_crop = 5
elif self.aug_type == 'ten_crop':
aug_transform = torchvision.transforms.TenCrop(self.central_crop_size)
self.num_crop = 10
self.i3d_transforms = {
'rgb': torchvision.transforms.Compose([
TensorCenterCrop(self.central_crop_size),
aug_transform,
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

any reason why we can't do it for the flow?

ScaleTo1_1(),
PermuteAndUnsqueeze()
]),
Expand Down Expand Up @@ -82,8 +92,12 @@ def extract(self, video_path: str) -> Dict[str, np.ndarray]:
# timestamp when the last frame in the stack begins (when the old frame of the last pair ends)
timestamps_ms = []
rgb_stack = []
feats_dict = {stream: [] for stream in self.streams}


if self.aug_type is not None:
feats_dict = {stream: [[] for _ in range(self.num_crop)] for stream in self.streams}
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Why treat each crop as a separate tensor instead of a batch dimension: B, Crops, D --> B*Crops, D?

else:
feats_dict = {stream: [] for stream in self.streams}

# sometimes when the target fps is 1 or 2, the first frame of the reencoded video is missing
# and cap.read returns None but the rest of the frames are ok. timestep is 0.0 for the 2nd frame in
# this case
Expand Down Expand Up @@ -113,7 +127,11 @@ def extract(self, video_path: str) -> Dict[str, np.ndarray]:
if len(rgb_stack) - 1 == self.stack_size:
batch_feats_dict = self.run_on_a_stack(rgb_stack, stack_counter, padder)
for stream in self.streams:
feats_dict[stream].extend(batch_feats_dict[stream].tolist())
if isinstance(batch_feats_dict[stream], tuple):
for i in range(len(batch_feats_dict[stream])):
feats_dict[stream][i].extend(batch_feats_dict[stream][i].tolist())
else:
feats_dict[stream].extend(batch_feats_dict[stream].tolist())
# leaving the elements if step_size < stack_size so they will not be loaded again
# if step_size == stack_size one element is left because the flow between the last element
# in the prev list and the first element in the current list
Expand Down Expand Up @@ -161,8 +179,11 @@ def run_on_a_stack(self, rgb_stack, stack_counter, padder=None) -> Dict[str, tor
raise NotImplementedError
# apply transforms depending on the stream (flow or rgb)
stream_slice = self.i3d_transforms[stream](stream_slice)
# extract features for a stream
batch_feats_dict[stream] = models[stream](stream_slice, features=True) # (B, 1024)
if isinstance(stream_slice, tuple):
# extract features for a stream
batch_feats_dict[stream] = tuple([models[stream](stream_crop, features=True) for stream_crop in stream_slice])
else:
batch_feats_dict[stream] = models[stream](stream_slice, features=True) # (B, 1024)
# add features to the output dict
self.maybe_show_pred(stream_slice, self.name2module['model'][stream], stack_counter)

Expand Down
8 changes: 6 additions & 2 deletions models/transforms.py
Original file line number Diff line number Diff line change
Expand Up @@ -145,13 +145,17 @@ def __call__(self, tensor: torch.FloatTensor) -> torch.FloatTensor:

class ScaleTo1_1(object):

def __call__(self, tensor: torch.FloatTensor) -> torch.FloatTensor:
def __call__(self, tensor):
if isinstance(tensor, tuple):
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lost typing

return tuple([(2 * t / 255) - 1 for t in tensor])
return (2 * tensor / 255) - 1


class PermuteAndUnsqueeze(object):

def __call__(self, tensor: torch.FloatTensor) -> torch.FloatTensor:
def __call__(self, tensor):
if isinstance(tensor, tuple):
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

lost typing

return tuple([t.permute(1, 0, 2, 3).unsqueeze(0) for t in tensor])
return tensor.permute(1, 0, 2, 3).unsqueeze(0)


Expand Down
16 changes: 12 additions & 4 deletions utils/utils.py
Original file line number Diff line number Diff line change
Expand Up @@ -50,9 +50,18 @@ def show_predictions_on_dataset(logits: torch.FloatTensor, dataset: Union[str, L
print(f'{logit:8.3f} | {smax:.3f} | {cls}')
print()

def make_path(output_root, video_path, output_key, ext):
def make_path(output_root, video_path, output_key, ext, idx=None):
Copy link
Owner

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

we shouldn't resort to this. it became incredibly redundant. we need to save all features in one file

# extract file name and change the extention
fname = f'{Path(video_path).stem}_{output_key}{ext}'
if idx is not None:
if output_key is not None:
fname = f'{Path(video_path).stem}_{output_key}_{idx}{ext}'
else:
fname = f'{Path(video_path).stem}_{idx}{ext}'
else:
if output_key is not None:
fname = f'{Path(video_path).stem}_{output_key}{ext}'
else:
fname = f'{Path(video_path).stem}_{idx}{ext}'
# construct the paths to save the features
return os.path.join(output_root, fname)

Expand Down Expand Up @@ -131,8 +140,7 @@ def form_list_from_user_input(
to_shuffle: bool = True,
) -> list:
'''User specifies either list of videos in the cmd or a path to a file with video paths. This function
transforms the user input into a list of paths. Files are expected to be formatted with a single
video-path in each line.
transforms the user input into a list of paths.

Args:
video_paths (Union[str, ListConfig, None], optional): a list of video paths. Defaults to None.
Expand Down