Use shell-first Genie app driving

Co-authored-by: Codex <noreply@openai.com>
This commit is contained in:
Iliyan Malchev
2026-03-19 14:23:45 -07:00
parent c2c663a6e5
commit 7b8096e6ef
7 changed files with 16 additions and 439 deletions

View File

@@ -1,37 +1,21 @@
package com.openai.codex.genie
import android.app.agent.GenieService
import android.content.ComponentName
import android.content.Context
import android.content.Intent
import android.graphics.Bitmap
import android.util.Base64
import java.io.ByteArrayOutputStream
import java.io.File
import java.io.IOException
import java.util.concurrent.TimeUnit
import org.json.JSONObject
class AndroidGenieToolExecutor(
private val context: Context,
private val callback: GenieService.Callback,
private val sessionId: String,
private val defaultTargetPackage: String,
) {
companion object {
// Absolute path of the `input` binary used for tap, text, and key-event injection.
private const val INPUT_BIN = "/system/bin/input"
// Absolute path of the `uiautomator` binary used to dump the UI hierarchy.
private const val UIAUTOMATOR_BIN = "/system/bin/uiautomator"
// Maximum time runCommand waits for a spawned command before giving up.
private const val SHELL_TIMEOUT_MS = 5_000L
// Upper bound on UI-dump XML characters included in a tool observation.
private const val MAX_UI_XML_CHARS = 8_000
}
fun execute(
toolName: String,
arguments: JSONObject,
@Suppress("UNUSED_PARAMETER") arguments: JSONObject,
): GenieToolObservation {
return when (toolName) {
"android.package.inspect" -> inspectPackage(arguments)
"android.intent.launch" -> launchIntent(arguments)
"android.target.show" -> requestTargetVisibility(
action = "show",
request = callback::requestShowDetachedTarget,
@@ -49,54 +33,10 @@ class AndroidGenieToolExecutor(
request = callback::requestCloseDetachedTarget,
)
"android.target.capture_frame" -> captureDetachedTargetFrame()
"android.ui.dump" -> dumpUiHierarchy()
"android.input.tap" -> tap(arguments)
"android.input.text" -> inputText(arguments)
"android.input.key" -> inputKey(arguments)
"android.wait" -> waitFor(arguments)
else -> throw IOException("Unknown tool: $toolName")
}
}
/**
 * Handles `android.package.inspect`: inspects the package named in [arguments]
 * (falling back to the session's default target) and returns its metadata as an
 * observation.
 */
private fun inspectPackage(arguments: JSONObject): GenieToolObservation {
    val targetPackage = resolvePackageName(arguments)
    val inspected = TargetAppInspector.inspect(context, targetPackage)
    val label = inspected.displayName()
    return GenieToolObservation(
        name = "android.package.inspect",
        summary = "Inspected $label ($targetPackage).",
        promptDetails = inspected.renderPromptSection(),
    )
}
/**
 * Handles `android.intent.launch`.
 *
 * Intent selection, in priority order:
 * 1. `component` given: an explicit intent for that flattened component name,
 *    carrying `action` as well when one was supplied.
 * 2. `action` given: an implicit intent with that action, restricted to the
 *    resolved package.
 * 3. Neither: the package manager's launch intent for the resolved package.
 *
 * The intent is always started with `FLAG_ACTIVITY_NEW_TASK`.
 *
 * @throws IOException if the component string cannot be parsed or the package
 *   has no launch intent.
 */
private fun launchIntent(arguments: JSONObject): GenieToolObservation {
    val packageName = resolvePackageName(arguments)
    val componentName = arguments.optString("component").trim()
    val requestedAction = arguments.optString("action").trim()
    val intent = when {
        componentName.isNotEmpty() -> {
            val component = ComponentName.unflattenFromString(componentName)
                ?: throw IOException("Invalid component: $componentName")
            Intent().also { explicit ->
                explicit.component = component
                // Carry the requested action too; it used to be dropped here
                // while the observation below still reported it as used.
                if (requestedAction.isNotEmpty()) {
                    explicit.action = requestedAction
                }
            }
        }
        requestedAction.isNotEmpty() -> Intent(requestedAction).also { implicit ->
            implicit.`package` = packageName
        }
        else -> context.packageManager.getLaunchIntentForPackage(packageName)
            ?: throw IOException("No launch intent for $packageName")
    }
    intent.addFlags(Intent.FLAG_ACTIVITY_NEW_TASK)
    context.startActivity(intent)
    return GenieToolObservation(
        name = "android.intent.launch",
        summary = "Launched target intent for $packageName.",
        promptDetails = buildString {
            appendLine("Launched target app.")
            appendLine("- package: $packageName")
            // Report what the started intent actually carries.
            appendLine("- action: ${intent.action ?: "default"}")
            append("- component: ${intent.component?.flattenToShortString() ?: "default launcher"}")
        },
    )
}
private fun requestTargetVisibility(
action: String,
request: (String) -> Unit,
@@ -132,95 +72,4 @@ class AndroidGenieToolExecutor(
),
)
}
/**
 * Handles `android.ui.dump`: runs `uiautomator dump` into a per-session file in
 * the cache directory and returns the XML, truncated to [MAX_UI_XML_CHARS]
 * characters, together with the command's own output.
 *
 * The dump file is deleted before the command runs and again afterwards, so a
 * run that exits successfully without writing the file cannot hand back XML
 * left over from an earlier call, and no dump is left behind in the cache.
 *
 * @throws IOException if the command fails or times out, or if it produced no
 *   output file.
 */
private fun dumpUiHierarchy(): GenieToolObservation {
    val outputFile = File(context.cacheDir, "genie-ui-$sessionId.xml")
    // Clear any dump from a previous call in this session before running.
    outputFile.delete()
    try {
        val commandOutput = runCommand(listOf(UIAUTOMATOR_BIN, "dump", outputFile.absolutePath))
        val commandSummary = commandOutput.ifBlank { "(no command output)" }
        if (!outputFile.isFile) {
            throw IOException("uiautomator dump produced no output file: $commandSummary")
        }
        val xml = outputFile.readText()
        val trimmedXml = if (xml.length > MAX_UI_XML_CHARS) {
            "${xml.take(MAX_UI_XML_CHARS)}\n...[truncated]"
        } else {
            xml
        }
        return GenieToolObservation(
            name = "android.ui.dump",
            summary = "Dumped UI hierarchy (${xml.length} chars).",
            promptDetails = buildString {
                appendLine("uiautomator dump output:")
                appendLine(commandSummary)
                appendLine()
                append(trimmedXml)
            },
        )
    } finally {
        // Best-effort cleanup; the XML has already been read into memory.
        outputFile.delete()
    }
}
/**
 * Handles `android.input.tap`: injects a tap at the `x`/`y` coordinates from
 * [arguments] via the `input` binary.
 *
 * @throws IOException if either coordinate is missing or not readable as an
 *   integer, or if the command fails.
 */
private fun tap(arguments: JSONObject): GenieToolObservation {
    // Sentinel returned by optInt when a coordinate is absent or unreadable.
    val missing = Int.MIN_VALUE
    val x = arguments.optInt("x", missing)
    val y = arguments.optInt("y", missing)
    if (x == missing || y == missing) {
        throw IOException("android.input.tap requires integer x and y")
    }
    val commandOutput = runCommand(listOf(INPUT_BIN, "tap", "$x", "$y"))
    val outputLine = commandOutput.ifBlank { "Command output: (none)" }
    return GenieToolObservation(
        name = "android.input.tap",
        summary = "Sent tap at ($x, $y).",
        promptDetails = "Executed input tap at ($x, $y).\n$outputLine",
    )
}
/**
 * Handles `android.input.text`: types the `text` argument through the `input`
 * binary, with each space replaced by `%s` before it is passed on.
 *
 * @throws IOException if `text` is missing or blank, or if the command fails.
 */
private fun inputText(arguments: JSONObject): GenieToolObservation {
    val text = arguments.optString("text")
    if (text.isBlank()) {
        throw IOException("android.input.text requires non-empty text")
    }
    val encodedText = text.replace(" ", "%s")
    val commandOutput = runCommand(listOf(INPUT_BIN, "text", encodedText))
    val outputLine = commandOutput.ifBlank { "Command output: (none)" }
    val charCount = text.length
    return GenieToolObservation(
        name = "android.input.text",
        summary = "Sent text input ($charCount chars).",
        promptDetails = "Executed input text for $charCount characters.\n$outputLine",
    )
}
/**
 * Handles `android.input.key`: sends the `key` argument to `input keyevent`.
 *
 * @throws IOException if `key` is missing or blank, or if the command fails.
 */
private fun inputKey(arguments: JSONObject): GenieToolObservation {
    val key = arguments.optString("key")
    if (key.isBlank()) {
        throw IOException("android.input.key requires key")
    }
    val commandOutput = runCommand(listOf(INPUT_BIN, "keyevent", key))
    val outputLine = commandOutput.ifBlank { "Command output: (none)" }
    return GenieToolObservation(
        name = "android.input.key",
        summary = "Sent key event $key.",
        promptDetails = "Executed input keyevent $key.\n$outputLine",
    )
}
/**
 * Handles `android.wait`: blocks the calling thread for `millis` milliseconds.
 *
 * @throws IOException if `millis` is missing or outside 1..10000.
 */
private fun waitFor(arguments: JSONObject): GenieToolObservation {
    val millis = arguments.optLong("millis", -1L)
    if (millis !in 1L..10_000L) {
        throw IOException("android.wait requires millis in 1..10000")
    }
    Thread.sleep(millis)
    return GenieToolObservation(
        name = "android.wait",
        summary = "Waited ${millis}ms.",
        promptDetails = "Paused execution for ${millis}ms.",
    )
}
/**
 * Returns the `packageName` argument when it is present and non-blank,
 * otherwise the default target package this executor was created with.
 */
private fun resolvePackageName(arguments: JSONObject): String =
    arguments.optString("packageName").ifBlank { defaultTargetPackage }
/**
 * Runs [command] as a child process with stderr merged into stdout and returns
 * its trimmed output.
 *
 * Output is drained on a helper thread while the process runs. Reading only
 * after the wait would let a child that fills the pipe buffer block on write
 * and be misreported as a timeout.
 *
 * @throws IOException if the command does not finish (or its output cannot be
 *   fully read) within [SHELL_TIMEOUT_MS], if reading its output fails, or if it
 *   exits with a non-zero status.
 */
private fun runCommand(command: List<String>): String {
    val commandLine = command.joinToString(" ")
    val process = ProcessBuilder(command)
        .redirectErrorStream(true)
        .start()

    var captured = ""
    var readFailure: IOException? = null
    val drainer = Thread {
        try {
            captured = process.inputStream.bufferedReader().use { it.readText() }
        } catch (err: IOException) {
            readFailure = err
        }
    }.apply {
        // Daemon so a reader stuck on an inherited pipe never blocks shutdown.
        isDaemon = true
        start()
    }

    try {
        val exited = process.waitFor(SHELL_TIMEOUT_MS, TimeUnit.MILLISECONDS)
        if (exited) {
            // join() also makes the drainer's writes visible to this thread.
            drainer.join(SHELL_TIMEOUT_MS)
        }
        if (!exited || drainer.isAlive) {
            throw IOException("Timed out: $commandLine")
        }
    } finally {
        // Covers the timeout path and an interrupted wait alike.
        if (process.isAlive) {
            process.destroyForcibly()
        }
    }

    readFailure?.let { throw it }
    val output = captured.trim()
    val exitCode = process.exitValue()
    if (exitCode != 0) {
        val detail = output.ifBlank { "exit $exitCode" }
        throw IOException("$commandLine failed: $detail")
    }
    return output
}
}

View File

@@ -23,7 +23,6 @@ class CodexAppServerHost(
private val control: GenieSessionControl,
private val bridgeClient: AgentBridgeClient,
private val runtimeStatus: CodexAgentBridge.RuntimeStatus,
private val targetAppContext: TargetAppContext?,
) : Closeable {
companion object {
private const val TAG = "CodexAppServerHost"
@@ -241,10 +240,8 @@ class CodexAppServerHost(
val toolName = params.optString("tool").trim()
val arguments = params.optJSONObject("arguments") ?: JSONObject()
val toolExecutor = AndroidGenieToolExecutor(
context = context,
callback = callback,
sessionId = request.sessionId,
defaultTargetPackage = request.targetPackage,
)
val observation = runCatching {
toolExecutor.execute(toolName, arguments)
@@ -467,7 +464,8 @@ class CodexAppServerHost(
You are Codex acting as a child Android Genie bound to ${request.targetPackage}.
The user interacts only with the supervising Agent.
Decide your own local plan and choose tools yourself.
Prefer the Android dynamic tools for observing and driving the target app.
Use normal Android shell commands for package discovery, activity launch, input injection, UI dumping, and screenshots whenever those commands are available.
Use Android dynamic tools only for framework-only detached target operations that do not have equivalent shell commands.
If you need clarification or a decision from the supervising Agent, call request_user_input with concise free-form question text.
Do not use hidden control protocols.
Finish with a normal assistant message describing what you accomplished or what blocked you.
@@ -477,97 +475,22 @@ class CodexAppServerHost(
}
/**
 * Builds the delegated prompt text: the target package, the delegated
 * objective from the request, and the target-app inspection section.
 *
 * When no target app context is available, the inspection section is a fixed
 * "unavailable" placeholder.
 */
private fun buildDelegatedPrompt(): String {
val targetSection = targetAppContext?.renderPromptSection()
?: "Target app inspection:\n- unavailable"
return """
Target package:
${request.targetPackage}
Delegated objective:
${request.prompt}
$targetSection
""".trimIndent()
}
/**
 * Builds the ordered array of dynamic tool specs: package inspection, intent
 * launch, the detached-target controls, UI dump, input injection, and wait.
 */
private fun buildDynamicToolSpecs(): JSONArray {
    // Spec for a tool that takes no arguments.
    fun noArgTool(name: String, description: String) =
        dynamicToolSpec(name, description, emptyObjectSchema())

    val specs = JSONArray()

    specs.put(
        dynamicToolSpec(
            name = "android.package.inspect",
            description = "Inspect package metadata for the paired Android target app.",
            inputSchema = objectSchema(
                properties = mapOf(
                    "packageName" to stringSchema("Optional package name override."),
                ),
            ),
        ),
    )
    specs.put(
        dynamicToolSpec(
            name = "android.intent.launch",
            description = "Launch the target app or an explicit target activity/intent.",
            inputSchema = objectSchema(
                properties = mapOf(
                    "packageName" to stringSchema("Optional package name override."),
                    "action" to stringSchema("Optional Android intent action."),
                    "component" to stringSchema("Optional flattened component name."),
                ),
            ),
        ),
    )

    // Argument-free tools, in the same order as before.
    specs.put(noArgTool("android.target.show", "Show the detached target window."))
    specs.put(noArgTool("android.target.hide", "Hide the detached target window."))
    specs.put(noArgTool("android.target.attach", "Reattach the detached target back to the main display."))
    specs.put(noArgTool("android.target.close", "Close the detached target window."))
    specs.put(noArgTool("android.target.capture_frame", "Capture the detached target window as an image."))
    specs.put(noArgTool("android.ui.dump", "Dump the current UI hierarchy via uiautomator."))

    specs.put(
        dynamicToolSpec(
            name = "android.input.tap",
            description = "Inject a tap at absolute screen coordinates.",
            inputSchema = objectSchema(
                properties = mapOf(
                    "x" to numberSchema("Absolute X coordinate."),
                    "y" to numberSchema("Absolute Y coordinate."),
                ),
                required = listOf("x", "y"),
            ),
        ),
    )
    specs.put(
        dynamicToolSpec(
            name = "android.input.text",
            description = "Inject text into the focused field.",
            inputSchema = objectSchema(
                properties = mapOf(
                    "text" to stringSchema("Text to type."),
                ),
                required = listOf("text"),
            ),
        ),
    )
    specs.put(
        dynamicToolSpec(
            name = "android.input.key",
            description = "Inject an Android keyevent by name or keycode token.",
            inputSchema = objectSchema(
                properties = mapOf(
                    "key" to stringSchema("Android keyevent token, for example ENTER or BACK."),
                ),
                required = listOf("key"),
            ),
        ),
    )
    specs.put(
        dynamicToolSpec(
            name = "android.wait",
            description = "Pause briefly to let the UI settle.",
            inputSchema = objectSchema(
                properties = mapOf(
                    "millis" to numberSchema("Milliseconds to sleep (1-10000)."),
                ),
                required = listOf("millis"),
            ),
        ),
    )

    return specs
}
private fun dynamicToolSpec(
@@ -598,18 +521,6 @@ class CodexAppServerHost(
.put("additionalProperties", false)
}
/** Returns a JSON schema fragment for a string property with [description]. */
private fun stringSchema(description: String): JSONObject =
    JSONObject().apply {
        put("type", "string")
        put("description", description)
    }
/** Returns a JSON schema fragment for a number property with [description]. */
private fun numberSchema(description: String): JSONObject =
    JSONObject().apply {
        put("type", "number")
        put("description", description)
    }
private fun buildDynamicToolContentItems(observation: GenieToolObservation): JSONArray {
val items = JSONArray().put(
JSONObject()

View File

@@ -49,23 +49,9 @@ class CodexGenieService : GenieService() {
)
callback.publishTrace(
sessionId,
"Genie is headless. It hosts codex app-server locally, routes model traffic through the Agent Binder bridge, and exposes Android tooling as dynamic tools.",
"Genie is headless. It hosts codex app-server locally, routes model traffic through the Agent Binder bridge, uses normal Android shell commands for package/app driving, and reserves dynamic tools for framework-only target controls.",
)
val targetAppContext = runCatching { TargetAppInspector.inspect(this, request.targetPackage) }
targetAppContext.onSuccess { targetApp ->
callback.publishTrace(
sessionId,
"Inspected target app inside the paired sandbox: ${targetApp.describeForTrace()}",
)
}
targetAppContext.onFailure { err ->
callback.publishTrace(
sessionId,
"Target app inspection failed for ${request.targetPackage}: ${err.message}",
)
}
if (request.isDetachedModeAllowed) {
callback.requestLaunchDetachedTargetHidden(sessionId)
callback.publishTrace(sessionId, "Requested detached target launch for ${request.targetPackage}.")
@@ -94,7 +80,6 @@ class CodexGenieService : GenieService() {
control = control,
bridgeClient = bridgeClient,
runtimeStatus = runtimeStatus,
targetAppContext = targetAppContext.getOrNull(),
).use { host ->
host.run()
}

View File

@@ -1,60 +0,0 @@
package com.openai.codex.genie
/**
 * Metadata describing a target app: its package, label, version, launcher
 * intent details, and requested permissions.
 *
 * Nullable fields are rendered as "unknown" / "unavailable" by the formatting
 * helpers below.
 */
data class TargetAppContext(
    val packageName: String,
    val applicationLabel: String?,
    val versionName: String?,
    val versionCode: Long?,
    val launchIntentAction: String?,
    val launchIntentComponent: String?,
    val requestedPermissions: List<String>,
) {
    /** Returns the application label when it is non-blank, otherwise the package name. */
    fun displayName(): String = applicationLabel?.takeIf(String::isNotBlank) ?: packageName

    /**
     * Returns a one-line summary with name, package, version, launcher, and at
     * most three permissions (followed by a "+N more" suffix when more exist).
     */
    fun describeForTrace(): String {
        val versionSummary = when {
            versionName != null && versionCode != null -> "version=$versionName ($versionCode)"
            versionName != null -> "version=$versionName"
            versionCode != null -> "versionCode=$versionCode"
            else -> "version=unknown"
        }
        val launcherSummary = launchIntentComponent?.let { component ->
            val actionSuffix = launchIntentAction?.let { " action=$it" } ?: ""
            "launcher=$component$actionSuffix"
        } ?: "launcher=unavailable"
        val permissionSummary = summarizePermissions(maxVisible = 3)
        return "${displayName()} ($packageName), $versionSummary, $launcherSummary, permissions=$permissionSummary"
    }

    /**
     * Returns the multi-line "Target app inspection" section, one `- key: value`
     * line per field followed by one line per requested permission.
     *
     * The text is assembled line by line rather than with a raw string and
     * `trimIndent()`: the permission list spans several lines, and splicing it
     * into an indented raw string would make the result depend on how this
     * source file is indented.
     */
    fun renderPromptSection(): String {
        val permissionLines = requestedPermissions
            .map { "- $it" }
            .ifEmpty { listOf("- none declared or visible") }
        val headerLines = listOf(
            "Target app inspection:",
            "- package: $packageName",
            "- label: ${displayName()}",
            "- versionName: ${versionName ?: "unknown"}",
            "- versionCode: ${versionCode ?: "unknown"}",
            "- launcherAction: ${launchIntentAction ?: "unavailable"}",
            "- launcherComponent: ${launchIntentComponent ?: "unavailable"}",
            "- requestedPermissions:",
        )
        return (headerLines + permissionLines).joinToString(separator = "\n")
    }

    /** Joins up to [maxVisible] permissions, noting how many were left out. */
    private fun summarizePermissions(maxVisible: Int): String {
        if (requestedPermissions.isEmpty()) {
            return "none"
        }
        val visible = requestedPermissions.take(maxVisible)
        val summary = visible.joinToString()
        val remaining = requestedPermissions.size - visible.size
        return if (remaining > 0) "$summary (+$remaining more)" else summary
    }
}

View File

@@ -1,50 +0,0 @@
package com.openai.codex.genie
import android.content.Context
import android.content.pm.PackageInfo
import android.content.pm.PackageManager
import android.os.Build
/** Reads package metadata from the package manager into a [TargetAppContext]. */
object TargetAppInspector {
    /**
     * Looks up [packageName] through [context]'s package manager and collects
     * its label, version, launch intent details, and sorted requested
     * permissions.
     */
    fun inspect(context: Context, packageName: String): TargetAppContext {
        val pm = context.packageManager
        val info = pm.getPackageInfoCompat(packageName)
        val appInfo = info.applicationInfo ?: pm.getApplicationInfo(packageName, 0)
        val launcher = pm.getLaunchIntentForPackage(packageName)
        // A label that fails to load is treated as absent.
        val label = runCatching { appInfo.loadLabel(pm)?.toString() }.getOrNull()
        val permissions = info.requestedPermissions
            ?.filterNotNull()
            ?.sorted()
            .orEmpty()
        return TargetAppContext(
            packageName = packageName,
            applicationLabel = label,
            versionName = info.versionName,
            versionCode = info.longVersionCodeCompat(),
            launchIntentAction = launcher?.action,
            launchIntentComponent = launcher?.component?.flattenToShortString(),
            requestedPermissions = permissions,
        )
    }

    /** Fetches package info with permissions, using the flags API on SDK 33+. */
    private fun PackageManager.getPackageInfoCompat(packageName: String): PackageInfo {
        val flags = PackageManager.GET_PERMISSIONS
        if (Build.VERSION.SDK_INT >= 33) {
            return getPackageInfo(packageName, PackageManager.PackageInfoFlags.of(flags.toLong()))
        }
        @Suppress("DEPRECATION")
        return getPackageInfo(packageName, flags)
    }

    /** Returns the long version code on SDK 28+, else the legacy int widened to Long. */
    private fun PackageInfo.longVersionCodeCompat(): Long? {
        if (Build.VERSION.SDK_INT >= 28) {
            return longVersionCode
        }
        @Suppress("DEPRECATION")
        return versionCode.toLong()
    }
}

View File

@@ -1,57 +0,0 @@
package com.openai.codex.genie
import org.junit.Assert.assertEquals
import org.junit.Test
/** Unit tests for the text rendering helpers on [TargetAppContext]. */
class TargetAppContextTest {
    @Test
    fun describeForTraceUsesLabelAndTruncatesPermissions() {
        // Four permissions: the trace should show three plus a "+1 more" suffix.
        val clock = TargetAppContext(
            packageName = "com.android.deskclock",
            applicationLabel = "Clock",
            versionName = "14",
            versionCode = 42,
            launchIntentAction = "android.intent.action.MAIN",
            launchIntentComponent = "com.android.deskclock/.DeskClock",
            requestedPermissions = listOf(
                "android.permission.POST_NOTIFICATIONS",
                "android.permission.SCHEDULE_EXACT_ALARM",
                "android.permission.SET_ALARM",
                "android.permission.WAKE_LOCK",
            ),
        )

        val actual = clock.describeForTrace()

        val expected = "Clock (com.android.deskclock), version=14 (42), launcher=com.android.deskclock/.DeskClock action=android.intent.action.MAIN, permissions=android.permission.POST_NOTIFICATIONS, android.permission.SCHEDULE_EXACT_ALARM, android.permission.SET_ALARM (+1 more)"
        assertEquals(expected, actual)
    }

    @Test
    fun renderPromptSectionFallsBackWhenMetadataIsMissing() {
        // Every optional field is absent, so each line should show its fallback.
        val bare = TargetAppContext(
            packageName = "com.example.target",
            applicationLabel = null,
            versionName = null,
            versionCode = null,
            launchIntentAction = null,
            launchIntentComponent = null,
            requestedPermissions = emptyList(),
        )

        val actual = bare.renderPromptSection()

        val expected = listOf(
            "Target app inspection:",
            "- package: com.example.target",
            "- label: com.example.target",
            "- versionName: unknown",
            "- versionCode: unknown",
            "- launcherAction: unavailable",
            "- launcherComponent: unavailable",
            "- requestedPermissions:",
            "- none declared or visible",
        ).joinToString(separator = "\n")
        assertEquals(expected, actual)
    }
}

View File

@@ -33,9 +33,9 @@ The current repo now contains these implementation slices:
- Direct child-session launch now runs through a dedicated hosted Agent
framework-session bridge, with Kotlin reduced to the framework/session host
layer.
- The Genie runtime inspects the paired target package from inside the
target-app sandbox and feeds package metadata plus launcher intent details
into the delegated Codex prompt.
- The Genie runtime now relies on the hosted Codex shell/tool path for target
package inspection, activity launch, input injection, and UI dumping instead
of host-side Kotlin wrappers for those operations.
- The hosted `codex app-server` process now talks to a **Genie-local loopback
HTTP proxy** inside the Genie app. That proxy forwards HTTP traffic to the
Agent over Binder/AIDL, keeping network/auth Agent-owned without assuming the
@@ -43,9 +43,9 @@ The current repo now contains these implementation slices:
- The Binder bridge now exposes a **narrow Responses transport** owned by the
Agent app itself, so Genie model traffic no longer depends on the legacy
`codexd` socket service.
- The Genie runtime still exposes Android-specific capabilities that are not
ordinary shell tools through host dynamic tools, but standard Android shell
and device commands stay in the normal Codex tool path.
- The Genie runtime now keeps host dynamic tools limited to framework-only
detached-target controls and frame capture, while standard Android shell and
device commands stay in the normal Codex tool path.
- Non-bridge Genie questions surface through AgentSDK question flow by mapping
`request_user_input` back into Agent-managed questions and answers.
- The Agent also attempts to answer Genie questions through its hosted Codex
@@ -143,8 +143,7 @@ foreground-service auth/status surface while this refactor proceeds.
- Exported Binder bridge request handling in `CodexAgentBridgeService`
- Binder bridge request issuance in `CodexGenieService`
- Agent-hosted runtime metadata for Genie bootstrap
- Target-app package metadata and launcher-intent inspection from the Genie
sandbox, with that context included in the delegated Codex prompt
- Shell-first Genie execution for package inspection, activity launch, input injection, and UI dumping
- Hosted `codex app-server` inside Genie, with model traffic routed through a
Genie-local proxy backed by the Agent Binder bridge
- Agent-owned `/v1/responses` proxying in
@@ -192,8 +191,8 @@ foreground-service auth/status surface while this refactor proceeds.
- `android/genie/src/main/java/com/openai/codex/genie/CodexGenieService.kt`
- Genie lifecycle host for the embedded `codex app-server`
- `android/genie/src/main/java/com/openai/codex/genie/CodexAppServerHost.kt`
- stdio JSON-RPC host for `codex app-server`, dynamic tools, and
`request_user_input` bridging
- stdio JSON-RPC host for `codex app-server`, framework-only dynamic tools,
and `request_user_input` bridging
- `android/genie/src/main/java/com/openai/codex/genie/GenieLocalCodexProxy.kt`
- Genie-local loopback HTTP proxy that forwards hosted `codex` HTTP traffic to
the Agent Binder bridge