Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
Original file line number Diff line number Diff line change
@@ -0,0 +1,99 @@
/*
* Copyright 2026 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.google.adk.tools.computeruse;

import com.google.adk.tools.Annotations.Schema;
import io.reactivex.rxjava3.core.Completable;
import io.reactivex.rxjava3.core.Single;
import java.time.Duration;
import java.util.List;

/**
* Contract for controlling a computer environment, such as a web browser or another interactive
* system.
*
* <p>Every operation returns an RxJava reactive type: the action methods return a {@link Single}
* that emits a {@link ComputerState}, while {@link #initialize()} and {@link #close()} return a
* {@link Completable}.
*
* <p>Method parameters carry {@link Schema} annotations that give each argument an explicit name
* (for example {@code destination_x} or {@code press_enter}). Parameters declared with {@code
* optional = true} use the boxed {@code Boolean} type.
*
* <p>NOTE(review): coordinates are presumably expressed in actual screen pixels by the time they
* reach an implementation (a wrapping tool may rescale them first) — confirm against callers.
*/
public interface BaseComputer {

/**
* Returns the screen size of the environment.
*
* @return a {@link Single} emitting the size as an {@code int} array. NOTE(review): presumably
*     {@code {width, height}} — the element order is not defined here; confirm with
*     implementations.
*/
Single<int[]> screenSize();

/**
* Opens the web browser.
*
* @return a {@link Single} emitting a {@link ComputerState}
*/
Single<ComputerState> openWebBrowser();

/**
* Clicks at a specific x, y coordinate on the webpage.
*
* @param x horizontal coordinate of the click
* @param y vertical coordinate of the click
* @return a {@link Single} emitting a {@link ComputerState}
*/
Single<ComputerState> clickAt(@Schema(name = "x") int x, @Schema(name = "y") int y);

/**
* Hovers at a specific x, y coordinate on the webpage.
*
* @param x horizontal coordinate to hover over
* @param y vertical coordinate to hover over
* @return a {@link Single} emitting a {@link ComputerState}
*/
Single<ComputerState> hoverAt(@Schema(name = "x") int x, @Schema(name = "y") int y);

/**
* Types text at a specific x, y coordinate.
*
* @param x horizontal coordinate of the typing target
* @param y vertical coordinate of the typing target
* @param text the text to type
* @param pressEnter optional flag (schema name {@code press_enter}). NOTE(review): presumably
*     requests an Enter key press after typing and may be {@code null} when omitted; the default
*     is not defined here — confirm with implementations.
* @param clearBeforeTyping optional flag (schema name {@code clear_before_typing}).
*     NOTE(review): presumably requests clearing existing content first and may be {@code null}
*     when omitted; the default is not defined here — confirm with implementations.
* @return a {@link Single} emitting a {@link ComputerState}
*/
Single<ComputerState> typeTextAt(
@Schema(name = "x") int x,
@Schema(name = "y") int y,
@Schema(name = "text") String text,
@Schema(name = "press_enter", optional = true) Boolean pressEnter,
@Schema(name = "clear_before_typing", optional = true) Boolean clearBeforeTyping);

/**
* Scrolls the entire webpage in a direction.
*
* @param direction the scroll direction. NOTE(review): the accepted values are not defined by
*     this interface — confirm with implementations.
* @return a {@link Single} emitting a {@link ComputerState}
*/
Single<ComputerState> scrollDocument(@Schema(name = "direction") String direction);

/**
* Scrolls at a specific x, y coordinate by magnitude.
*
* @param x horizontal coordinate at which to scroll
* @param y vertical coordinate at which to scroll
* @param direction the scroll direction. NOTE(review): accepted values are not defined here.
* @param magnitude the amount to scroll. NOTE(review): the unit is not defined here — confirm
*     with implementations.
* @return a {@link Single} emitting a {@link ComputerState}
*/
Single<ComputerState> scrollAt(
@Schema(name = "x") int x,
@Schema(name = "y") int y,
@Schema(name = "direction") String direction,
@Schema(name = "magnitude") int magnitude);

/**
* Waits for the specified duration.
*
* <p>This is an overload taking a {@link Duration}; it is distinct from the {@code Object.wait}
* monitor methods.
*
* @param duration how long to wait
* @return a {@link Single} emitting a {@link ComputerState}
*/
Single<ComputerState> wait(@Schema(name = "duration") Duration duration);

/**
* Navigates back.
*
* @return a {@link Single} emitting a {@link ComputerState}
*/
Single<ComputerState> goBack();

/**
* Navigates forward.
*
* @return a {@link Single} emitting a {@link ComputerState}
*/
Single<ComputerState> goForward();

/**
* Jumps to search.
*
* @return a {@link Single} emitting a {@link ComputerState}
*/
Single<ComputerState> search();

/**
* Navigates to a URL.
*
* @param url the URL to navigate to
* @return a {@link Single} emitting a {@link ComputerState}
*/
Single<ComputerState> navigate(@Schema(name = "url") String url);

/**
* Presses a key combination.
*
* @param keys the keys making up the combination. NOTE(review): the key naming scheme is not
*     defined here — confirm with implementations.
* @return a {@link Single} emitting a {@link ComputerState}
*/
Single<ComputerState> keyCombination(@Schema(name = "keys") List<String> keys);

/**
* Drags from one coordinate and drops at another.
*
* @param x horizontal coordinate of the drag start
* @param y vertical coordinate of the drag start
* @param destinationX horizontal coordinate of the drop target (schema name {@code
*     destination_x})
* @param destinationY vertical coordinate of the drop target (schema name {@code destination_y})
* @return a {@link Single} emitting a {@link ComputerState}
*/
Single<ComputerState> dragAndDrop(
@Schema(name = "x") int x,
@Schema(name = "y") int y,
@Schema(name = "destination_x") int destinationX,
@Schema(name = "destination_y") int destinationY);

/**
* Returns the current state.
*
* @return a {@link Single} emitting the current {@link ComputerState}
*/
Single<ComputerState> currentState();

/**
* Initializes the computer.
*
* @return a {@link Completable} representing the initialization
*/
Completable initialize();

/**
* Cleans up resources.
*
* @return a {@link Completable} representing the cleanup
*/
Completable close();

/**
* Returns the environment.
*
* @return a {@link Single} emitting the {@link ComputerEnvironment} of this computer
*/
Single<ComputerEnvironment> environment();
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,23 @@
/*
* Copyright 2026 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.google.adk.tools.computeruse;

/**
 * Kinds of computer environment.
 *
 * <p>The constant names and their declaration order are part of the public contract.
 */
public enum ComputerEnvironment {
  /** No particular environment has been specified. */
  ENVIRONMENT_UNSPECIFIED,

  /** A browser environment. */
  ENVIRONMENT_BROWSER;
}
108 changes: 108 additions & 0 deletions core/src/main/java/com/google/adk/tools/computeruse/ComputerState.java
Original file line number Diff line number Diff line change
@@ -0,0 +1,108 @@
/*
* Copyright 2026 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.google.adk.tools.computeruse;

import com.fasterxml.jackson.annotation.JsonCreator;
import com.fasterxml.jackson.annotation.JsonProperty;
import com.google.errorprone.annotations.CanIgnoreReturnValue;
import java.util.Arrays;
import java.util.Objects;
import java.util.Optional;

/**
 * Immutable snapshot of the state of a computer environment.
 *
 * <p>A state consists of:
 *
 * <ul>
 *   <li>{@code screenshot}: the screenshot in PNG format as bytes (always present);
 *   <li>{@code url}: the current URL of the webpage being displayed, if any.
 * </ul>
 *
 * <p>The screenshot array is defensively copied on the way in and on the way out, so neither the
 * caller's array nor an array returned by {@link #screenshot()} aliases the internal state.
 */
public final class ComputerState {
  private final byte[] screenshot;
  private final Optional<String> url;

  /**
   * Creates a state. Annotated as the Jackson creator, so it is also the deserialization entry
   * point.
   *
   * @param screenshot the screenshot bytes; copied before being stored
   * @param url the URL, or {@code null}/empty when there is none
   * @throws NullPointerException if {@code screenshot} is {@code null}
   */
  @JsonCreator
  private ComputerState(
      @JsonProperty("screenshot") byte[] screenshot, @JsonProperty("url") Optional<String> url) {
    // Fail with a descriptive message instead of a bare NPE raised from clone().
    this.screenshot = Objects.requireNonNull(screenshot, "screenshot must not be null").clone();
    // A null Optional must never escape through url(); treat it as "no URL".
    this.url = url == null ? Optional.empty() : url;
  }

  /**
   * Returns a copy of the screenshot bytes.
   *
   * @return a fresh array on every call; modifying it does not affect this state
   */
  @JsonProperty("screenshot")
  public byte[] screenshot() {
    return screenshot.clone();
  }

  /**
   * Returns the URL, if any.
   *
   * @return the URL, or {@link Optional#empty()} when none was provided; never {@code null}
   */
  @JsonProperty("url")
  public Optional<String> url() {
    return url;
  }

  /** Returns a new, empty {@link Builder}. */
  public static Builder builder() {
    return new Builder();
  }

  /** Builder for {@link ComputerState}. */
  public static final class Builder {
    private byte[] screenshot;
    private Optional<String> url = Optional.empty();

    /**
     * Sets the screenshot.
     *
     * @param screenshot the screenshot bytes; copied immediately
     * @return this builder
     * @throws NullPointerException if {@code screenshot} is {@code null}
     */
    @CanIgnoreReturnValue
    public Builder screenshot(byte[] screenshot) {
      this.screenshot = Objects.requireNonNull(screenshot, "screenshot must not be null").clone();
      return this;
    }

    /**
     * Sets the URL from an {@link Optional}.
     *
     * @param url the URL; a {@code null} reference is treated as {@link Optional#empty()}
     * @return this builder
     */
    @CanIgnoreReturnValue
    public Builder url(Optional<String> url) {
      this.url = url == null ? Optional.empty() : url;
      return this;
    }

    /**
     * Sets the URL from a possibly-null string.
     *
     * @param url the URL, or {@code null} for none
     * @return this builder
     */
    @CanIgnoreReturnValue
    public Builder url(String url) {
      this.url = Optional.ofNullable(url);
      return this;
    }

    /**
     * Builds the state.
     *
     * @return a new {@link ComputerState}
     * @throws NullPointerException if no screenshot has been set
     */
    public ComputerState build() {
      return new ComputerState(screenshot, url);
    }
  }

  /**
   * Creates a state from a screenshot and a possibly-null URL.
   *
   * @param screenshot the screenshot bytes
   * @param url the URL, or {@code null} for none
   * @return a new {@link ComputerState}
   * @throws NullPointerException if {@code screenshot} is {@code null}
   */
  public static ComputerState create(byte[] screenshot, String url) {
    return builder().screenshot(screenshot).url(url).build();
  }

  /**
   * Creates a state from a screenshot only, with no URL.
   *
   * @param screenshot the screenshot bytes
   * @return a new {@link ComputerState}
   * @throws NullPointerException if {@code screenshot} is {@code null}
   */
  public static ComputerState create(byte[] screenshot) {
    return builder().screenshot(screenshot).build();
  }

  /** Two states are equal when their screenshot contents and URLs are equal. */
  @Override
  public boolean equals(Object o) {
    if (this == o) {
      return true;
    }
    if (!(o instanceof ComputerState that)) {
      return false;
    }
    // Compare array contents, not references.
    return Arrays.equals(screenshot, that.screenshot) && url.equals(that.url);
  }

  @Override
  public int hashCode() {
    // Content-based array hash keeps hashCode consistent with equals.
    return Objects.hash(Arrays.hashCode(screenshot), url);
  }
}
Original file line number Diff line number Diff line change
@@ -0,0 +1,118 @@
/*
* Copyright 2026 Google LLC
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
* You may obtain a copy of the License at
*
* http://www.apache.org/licenses/LICENSE-2.0
*
* Unless required by applicable law or agreed to in writing, software
* distributed under the License is distributed on an "AS IS" BASIS,
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
* See the License for the specific language governing permissions and
* limitations under the License.
*/

package com.google.adk.tools.computeruse;

import com.google.adk.tools.FunctionTool;
import com.google.adk.tools.ToolContext;
import com.google.common.collect.ImmutableMap;
import io.reactivex.rxjava3.core.Single;
import java.lang.reflect.Method;
import java.util.Base64;
import java.util.HashMap;
import java.util.Map;
import org.slf4j.Logger;
import org.slf4j.LoggerFactory;

/**
 * A tool that wraps computer control functions for use with LLMs.
 *
 * <p>Before the wrapped method is invoked, the arguments named {@code x}, {@code y}, {@code
 * destination_x} and {@code destination_y} are rescaled from a virtual coordinate space to the
 * actual screen size and clamped to the screen bounds. Both sizes are supplied to the constructor.
 *
 * <p>After the wrapped method completes, a {@code byte[]} value under the {@code "screenshot"} key
 * of the result is replaced by an {@code "image"} entry holding the mimetype and the
 * Base64-encoded data.
 */
public class ComputerUseTool extends FunctionTool {

  private static final Logger logger = LoggerFactory.getLogger(ComputerUseTool.class);

  /** Index of the horizontal dimension in the size arrays. */
  private static final int X_AXIS = 0;

  /** Index of the vertical dimension in the size arrays. */
  private static final int Y_AXIS = 1;

  private final int[] screenSize;
  private final int[] coordinateSpace;

  /**
   * Creates a tool around {@code func}.
   *
   * @param instance the object on which {@code func} is invoked
   * @param func the method to expose as a tool
   * @param screenSize actual screen size as {@code {width, height}}; both values must be positive
   * @param virtualScreenSize size of the coordinate space used by incoming arguments as {@code
   *     {width, height}}; both values must be positive
   * @throws IllegalArgumentException if either size is {@code null}, does not have exactly two
   *     elements, or contains a non-positive value
   */
  public ComputerUseTool(Object instance, Method func, int[] screenSize, int[] virtualScreenSize) {
    super(instance, func, /* isLongRunning= */ false);
    this.screenSize = validatedCopy(screenSize, "screenSize");
    this.coordinateSpace = validatedCopy(virtualScreenSize, "virtualScreenSize");
  }

  /**
   * Validates a {@code {width, height}} array and returns a private copy of it, so that later
   * changes to the caller's array cannot alter the scaling.
   */
  private static int[] validatedCopy(int[] size, String name) {
    if (size == null || size.length != 2) {
      throw new IllegalArgumentException(name + " must be an array of {width, height}");
    }
    if (size[X_AXIS] <= 0 || size[Y_AXIS] <= 0) {
      // A zero dimension would otherwise divide by zero and silently yield garbage coordinates.
      throw new IllegalArgumentException(name + " dimensions must be positive");
    }
    return size.clone();
  }

  /** Rescales a horizontal coordinate to the screen width. */
  private int normalizeX(Object xObj) {
    return normalize(xObj, X_AXIS, "x", "width");
  }

  /** Rescales a vertical coordinate to the screen height. */
  private int normalizeY(Object yObj) {
    return normalize(yObj, Y_AXIS, "y", "height");
  }

  /**
   * Rescales one coordinate from the virtual coordinate space to the screen and clamps it to
   * {@code [0, screen extent - 1]}.
   *
   * @param value the raw argument; must be a {@link Number}
   * @param axis {@link #X_AXIS} or {@link #Y_AXIS}
   * @param axisName axis label used in messages ({@code "x"} or {@code "y"})
   * @param extentName extent label used in the debug log ({@code "width"} or {@code "height"})
   * @throws IllegalArgumentException if {@code value} is not numeric (including {@code null})
   */
  private int normalize(Object value, int axis, String axisName, String extentName) {
    if (!(value instanceof Number number)) {
      throw new IllegalArgumentException(axisName + " coordinate must be numeric");
    }
    double coordinate = number.doubleValue();
    // The cast truncates toward zero; out-of-range results are handled by the clamp below.
    int normalized = (int) (coordinate / coordinateSpace[axis] * screenSize[axis]);
    int clamped = Math.max(0, Math.min(normalized, screenSize[axis] - 1));
    // SLF4J substitutes "{}" placeholders only; printf-style specifiers are printed verbatim.
    logger.atDebug().log(
        "{} coordinate: {}, normalized: {}, clamped: {}, screen {}: {}, coordinate space {}: {}",
        axisName,
        coordinate,
        normalized,
        clamped,
        extentName,
        screenSize[axis],
        extentName,
        coordinateSpace[axis]);
    return clamped;
  }

  /**
   * Runs the wrapped method with rescaled coordinates and post-processes a screenshot in the
   * result.
   *
   * @param args the tool arguments; not modified
   * @param toolContext the context passed through to the wrapped tool
   * @return a {@link Single} emitting the result map, with a {@code byte[]} {@code "screenshot"}
   *     entry replaced by an {@code "image"} entry
   * @throws IllegalArgumentException if a coordinate argument is present but not numeric
   */
  @Override
  public Single<Map<String, Object>> runAsync(Map<String, Object> args, ToolContext toolContext) {
    // Work on a copy so the caller's map is left untouched.
    Map<String, Object> normalizedArgs = new HashMap<>(args);

    if (args.containsKey("x")) {
      normalizedArgs.put("x", normalizeX(args.get("x")));
    }
    if (args.containsKey("y")) {
      normalizedArgs.put("y", normalizeY(args.get("y")));
    }
    if (args.containsKey("destination_x")) {
      normalizedArgs.put("destination_x", normalizeX(args.get("destination_x")));
    }
    if (args.containsKey("destination_y")) {
      normalizedArgs.put("destination_y", normalizeY(args.get("destination_y")));
    }

    return super.runAsync(normalizedArgs, toolContext).map(ComputerUseTool::screenshotToImage);
  }

  /**
   * Replaces a {@code byte[]} {@code "screenshot"} entry with an {@code "image"} map containing
   * the mimetype and Base64-encoded data. Results without such an entry are returned unchanged.
   */
  private static Map<String, Object> screenshotToImage(Map<String, Object> result) {
    if (!(result.get("screenshot") instanceof byte[] screenshot)) {
      return result;
    }
    ImmutableMap<String, Object> imageMap =
        ImmutableMap.of(
            "mimetype", "image/png", "data", Base64.getEncoder().encodeToString(screenshot));
    // Copy rather than mutate: the map produced upstream may be immutable.
    Map<String, Object> finalResult = new HashMap<>(result);
    finalResult.remove("screenshot");
    finalResult.put("image", imageMap);
    return finalResult;
  }
}
Loading