import json
import os

import cv2
import numpy as np
from openai import OpenAI

from slurm_utils import find_pixel_center_of_cloth, find_corners
from utils.gpt_utils import analyze_images_gpt, get_user_prompt, system_prompt, gpt_v_demonstrations, parse_output


def crop_input_image(input_rgb, input_depth, cropped_depth_image):
    '''
    Crops the input RGB and depth images so that the saved depth image has the cloth
    roughly at its center.
    Note that this function returns the pivot pixel coordinate, which is needed to map
    pixels from the cropped image back to the original image when computing the real
    pick and place pixels.
    '''
    # Load the images
    image = cv2.imread(input_rgb)
    depth_image = cv2.imread(input_depth)
    pivot_coordinate = np.array([0, 0])

    # Crop the image initially since there's a big bounding box present already
    height, width = image.shape[:2]
    image = image[20:height-20, 20:width-20]
    depth_image = depth_image[20:height-20, 20:width-20]
    pivot_coordinate += np.array([20, 20])

    # Convert the image to grayscale
    gray = cv2.cvtColor(image, cv2.COLOR_BGR2GRAY)
    # Apply edge detection
    edges = cv2.Canny(gray, 50, 150)
    # Find contours in the edge-detected image
    contours, _ = cv2.findContours(edges, cv2.RETR_EXTERNAL, cv2.CHAIN_APPROX_SIMPLE)

    # Calculate the center of the image
    center_x = image.shape[1] // 2
    center_y = image.shape[0] // 2

    # Calculate the distances of the contour centroids from the image center
    distances = []
    for contour in contours:
        M = cv2.moments(contour)
        if M["m00"] != 0:
            cX = int(M["m10"] / M["m00"])
            cY = int(M["m01"] / M["m00"])
            distance = np.sqrt((cX - center_x) ** 2 + (cY - center_y) ** 2)
            distances.append((contour, distance))

    # Sort the contours by distance from the center
    distances.sort(key=lambda x: x[1])
    # Select the contour closest to the center as the region of interest (ROI)
    closest_contour = distances[0][0]

    # Get the bounding box of the closest contour
    x, y, w, h = cv2.boundingRect(closest_contour)

    # Add padding around the bounding box
    padding = 40
    x -= padding
    y -= padding
    w += 2 * padding
    h += 2 * padding

    # Ensure the bounding box stays within the image bounds
    x = max(0, x)
    y = max(0, y)
    w = min(image.shape[1] - x, w)
    h = min(image.shape[0] - y, h)

    # Crop the region of interest with padding
    cropped_image = image[y:y+h, x:x+w]
    # Mark the ROI origin on the full image (debug visualization)
    image[y, x] = (255, 0, 0)
    pivot_coordinate += np.array([x, y])

    # Display the cropped image to see if it's all working alright :)
    cv2.imshow('Cropped Image', cropped_image)
    cv2.waitKey(0)
    cv2.destroyAllWindows()

    # Now crop the depth image with the same bounding box and save it
    depth_image = depth_image[y:y+h, x:x+w]
    cv2.imwrite(cropped_depth_image, depth_image)

    # Return the pivot pixel coordinate to be used later
    return pivot_coordinate
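
# Illustrative usage sketch for crop_input_image (not part of the original pipeline; the
# file paths below are hypothetical placeholders). The point is that any pixel found on
# the cropped depth image must be offset by the returned pivot to recover its coordinates
# in the original camera image:
#
#   pivot = crop_input_image("captures/rgb.png", "captures/depth.png", "captures/depth_cropped.png")
#   original_pixel = pivot + np.array([local_x, local_y])
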
def get_initial_cloth_center(initial_input_rgb, initial_input_depth, initial_cropped_depth_image):
    '''
    Takes the RGB and depth images of the initial cloth configuration and returns the pixel
    coordinate of the cloth center in the original (uncropped) image.
    '''
    # First crop the initial cloth images to get the new depth image
    initial_pivot_coordinate = crop_input_image(initial_input_rgb, initial_input_depth, initial_cropped_depth_image)
    # Find the cloth center in the saved initial cropped depth image
    initial_local_cloth_center = find_pixel_center_of_cloth(initial_cropped_depth_image, False)
    # Add the pivot coordinate to return the true coordinates of the cloth center
    return initial_pivot_coordinate + np.array(initial_local_cloth_center)
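
# Note: the returned center is expressed in the coordinates of the original, uncropped
# camera image (the local center found on the cropped depth image is shifted back by the
# pivot). gpt_for_real_world expects the cloth center in exactly this frame.
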
def gpt_for_real_world(input_rgb, input_depth, cropped_depth_image, cloth_center, task, current_step):
    '''
    Runs the overall GPT pipeline for the real-world experiments.
    Args:
        input_rgb: The file path corresponding to the RGB image of the current cloth configuration
        input_depth: The file path corresponding to the depth image of the current cloth configuration
        cropped_depth_image: The file path where the cropped depth image is expected to be saved
        cloth_center: The pixel coordinates of the initial cloth center in the original image
        task: The folding task to perform. Use one of DoubleTriangle, AllCornersInward, CornersEdgesInward, DoubleStraight
        current_step: The number of folding steps executed for the current test case so far (starts at 0)
    '''
    # Set up the client to interact with OpenAI. Using Daniel's API key for now
    # TODO: Use your own API key for performing the experiments
    client = OpenAI(api_key="api_key")
    # IMPORTANT - This model will get deprecated on Dec 6, 2024. Kindly use any newer OpenAI model with vision reasoning abilities
    gpt_vision_model = "gpt-4-vision-preview"

    # Crop the input RGB and depth images to get their cropped versions
    pivot_coordinate = crop_input_image(input_rgb, input_depth, cropped_depth_image)
    # Get the local cloth center for the current image configuration
    cloth_center = cloth_center - pivot_coordinate
    # Get the cloth corners for the current cloth configuration
    cloth_corners = find_corners(cropped_depth_image, False)

    # Get the template folding instruction images from the demonstrations
    demo_root_path = os.path.join("data", "demo", task, "rgbviz")
    start_image = os.path.join(demo_root_path, str(current_step) + ".png")
    last_image = os.path.join(demo_root_path, str(current_step+1) + ".png")
    instruction = analyze_images_gpt([start_image, last_image], task, current_step, "in-context", gpt_vision_model)

    # Build the user prompt based on the information that we have so far
    user_prompt = get_user_prompt(cloth_corners, cloth_center, True, instruction, task, None)
    print("The user prompt was: ", user_prompt)

    # Get the demonstrations that will be used for this specific task
    indices = gpt_v_demonstrations["in-context"][task]["gpt-demonstrations"]
    # Imp: The information corresponding to the demonstrations is assumed to be in the utils folder
    demonstration_dictionary_list = []
    gpt_demonstrations_path = os.path.join("utils", "gpt-demonstrations", task, "demonstrations.json")
    with open(gpt_demonstrations_path, 'r') as f:
        gpt_demonstrations = json.load(f)

    # Fetch the information from the demonstrations as in-context data
    for index in indices:
        step_dictionary = gpt_demonstrations[str(index)][str(current_step + 1)]
        user_prompt_dictionary = {
            "role": "user",
            "content": step_dictionary["user-prompt"]
        }
        assistant_response_dictionary = {
            "role": "assistant",
            "content": step_dictionary["assistant-response"]
        }
        demonstration_dictionary_list += [user_prompt_dictionary, assistant_response_dictionary]

    # Make a call to the OpenAI API now that we have the demonstrations
    response = client.chat.completions.create(
        model="gpt-3.5-turbo-0125",
        messages=[
            {
                "role": "system",
                "content": system_prompt
            }] + demonstration_dictionary_list +
            [{
                "role": "user",
                "content": user_prompt
            }],
        temperature=0,
        max_tokens=769,
        top_p=1,
        frequency_penalty=0,
        presence_penalty=0
    )

    # Parse the above output to get the pixel coordinates for pick and place
    test_pick_pixel, test_place_pixel = parse_output(response.choices[0].message.content)
    print("The system response was: ", response.choices[0].message.content)

    # Return the pick and place points after adjusting with the pivot coordinate
    return pivot_coordinate + test_pick_pixel, pivot_coordinate + test_place_pixel
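

# Illustrative end-to-end sketch of how these helpers fit together (assumed usage, not part
# of the original script): the file paths, task name, and number of fold steps below are
# hypothetical placeholders, and the returned pick/place pixels would normally be sent to
# the robot controller between steps.
if __name__ == "__main__":
    # Compute the cloth center once from the initial configuration (original-image coordinates)
    cloth_center = get_initial_cloth_center("captures/rgb_0.png", "captures/depth_0.png",
                                            "captures/depth_0_cropped.png")
    task = "DoubleTriangle"  # one of the four supported folding tasks
    num_steps = 2  # placeholder; the number of fold steps depends on the chosen task
    for step in range(num_steps):
        rgb = "captures/rgb_{}.png".format(step)
        depth = "captures/depth_{}.png".format(step)
        cropped = "captures/depth_{}_cropped.png".format(step)
        pick_pixel, place_pixel = gpt_for_real_world(rgb, depth, cropped, cloth_center, task, step)
        print("Step {}: pick {} -> place {}".format(step, pick_pixel, place_pixel))
        # ...execute the pick-and-place on the robot, then capture new RGB/depth images...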