-
Notifications
You must be signed in to change notification settings - Fork 394
Add --diagnostic-port option to dotnet-stack #5592
New issue
Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.
By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.
Already on GitHub? Sign in to your account
base: main
Are you sure you want to change the base?
Changes from 10 commits
47ce0d2
682a87d
1c97f11
07c8f6c
a72e921
877a49d
ea77b81
87df662
3a3a765
7e8ef6b
79364ff
0e21b47
b7b3cf7
9fd1ccf
File filter
Filter by extension
Conversations
Jump to
Diff view
Diff view
There are no files selected for viewing
| Original file line number | Diff line number | Diff line change |
|---|---|---|
|
|
@@ -27,22 +27,44 @@ internal static class ReportCommandHandler | |
| /// <param name="processId">The process to report the stack from.</param> | ||
| /// <param name="name">The name of process to report the stack from.</param> | ||
| /// <param name="duration">The duration of to trace the target for. </param> | ||
| /// <param name="diagnosticPort">The diagnostic port to connect to.</param> | ||
| /// <returns></returns> | ||
| private static async Task<int> Report(CancellationToken ct, TextWriter stdOutput, TextWriter stdError, int processId, string name, TimeSpan duration) | ||
| private static async Task<int> Report(CancellationToken ct, TextWriter stdOutput, TextWriter stdError, int processId, string name, TimeSpan duration, string diagnosticPort) | ||
| { | ||
| string tempNetTraceFilename = Path.Join(Path.GetTempPath(), Path.GetRandomFileName() + ".nettrace"); | ||
| string tempEtlxFilename = ""; | ||
|
|
||
| try | ||
| { | ||
| // Either processName or processId has to be specified. | ||
| // Validate that only one of processId, name, or diagnosticPort is specified | ||
| int optionCount = 0; | ||
| if (processId != 0) | ||
| { | ||
| optionCount++; | ||
| } | ||
| if (!string.IsNullOrEmpty(name)) | ||
| { | ||
| optionCount++; | ||
| } | ||
| if (!string.IsNullOrEmpty(diagnosticPort)) | ||
| { | ||
| optionCount++; | ||
| } | ||
|
|
||
| if (optionCount == 0) | ||
| { | ||
| stdError.WriteLine("--process-id, --name, or --diagnostic-port is required"); | ||
| return -1; | ||
| } | ||
| else if (optionCount > 1) | ||
| { | ||
| stdError.WriteLine("Only one of --process-id, --name, or --diagnostic-port can be specified"); | ||
| return -1; | ||
| } | ||
|
|
||
| // Resolve process name to ID if needed | ||
| if (!string.IsNullOrEmpty(name)) | ||
| { | ||
| if (processId != 0) | ||
| { | ||
| Console.WriteLine("Can only specify either --name or --process-id option."); | ||
| return -1; | ||
| } | ||
| processId = CommandUtils.FindProcessIdWithName(name); | ||
| if (processId < 0) | ||
| { | ||
|
|
@@ -55,91 +77,125 @@ private static async Task<int> Report(CancellationToken ct, TextWriter stdOutput | |
| stdError.WriteLine("Process ID should not be negative."); | ||
| return -1; | ||
| } | ||
| else if (processId == 0) | ||
|
|
||
| DiagnosticsClientBuilder builder = new("dotnet-stack", 10); | ||
| using (DiagnosticsClientHolder holder = await builder.Build(ct, processId, diagnosticPort, showChildIO: false, printLaunchCommand: false).ConfigureAwait(false)) | ||
| { | ||
| stdError.WriteLine("--process-id is required"); | ||
| return -1; | ||
| } | ||
| if (holder == null) | ||
| { | ||
| return -1; | ||
| } | ||
|
|
||
| DiagnosticsClient client = holder.Client; | ||
|
|
||
| DiagnosticsClient client = new(processId); | ||
| List<EventPipeProvider> providers = new() | ||
| { | ||
| new EventPipeProvider("Microsoft-DotNETCore-SampleProfiler", EventLevel.Informational) | ||
| }; | ||
|
|
||
| // collect a *short* trace with stack samples | ||
| // the hidden '--duration' flag can increase the time of this trace in case 10ms | ||
| // is too short in a given environment, e.g., resource constrained systems | ||
| // N.B. - This trace INCLUDES rundown. For sufficiently large applications, it may take non-trivial time to collect | ||
| // the symbol data in rundown. | ||
| EventPipeSession session = await client.StartEventPipeSessionAsync(providers, requestRundown:true, token:ct).ConfigureAwait(false); | ||
| using (session) | ||
| using (FileStream fs = File.OpenWrite(tempNetTraceFilename)) | ||
| { | ||
| Task copyTask = session.EventStream.CopyToAsync(fs, ct); | ||
| await Task.Delay(duration, ct).ConfigureAwait(false); | ||
| await session.StopAsync(ct).ConfigureAwait(false); | ||
|
|
||
| // check if rundown is taking more than 5 seconds and add comment to report | ||
| Task timeoutTask = Task.Delay(TimeSpan.FromSeconds(5)); | ||
| Task completedTask = await Task.WhenAny(copyTask, timeoutTask).ConfigureAwait(false); | ||
| if (completedTask == timeoutTask) | ||
| // Resume runtime if it was suspended (similar to --resume-runtime:true in other tools) | ||
| // This is safe to call even if the runtime wasn't suspended - it's a no-op in that case | ||
| try | ||
| { | ||
| stdOutput.WriteLine($"# Sufficiently large applications can cause this reportCommand to take non-trivial amounts of time"); | ||
| await client.ResumeRuntimeAsync(ct).ConfigureAwait(false); | ||
| } | ||
| catch (Exception) | ||
| { | ||
| // ResumeRuntime is a no-op if the runtime wasn't suspended, | ||
| // so we can safely ignore exceptions here | ||
| } | ||
| await copyTask.ConfigureAwait(false); | ||
| } | ||
|
|
||
| // using the generated trace file, symbolicate and compute stacks. | ||
| tempEtlxFilename = TraceLog.CreateFromEventPipeDataFile(tempNetTraceFilename); | ||
| using (SymbolReader symbolReader = new(TextWriter.Null) { SymbolPath = SymbolPath.MicrosoftSymbolServerPath }) | ||
| using (TraceLog eventLog = new(tempEtlxFilename)) | ||
| { | ||
| MutableTraceEventStackSource stackSource = new(eventLog) | ||
| List<EventPipeProvider> providers = new() | ||
| { | ||
| OnlyManagedCodeStacks = true | ||
| new EventPipeProvider("Microsoft-DotNETCore-SampleProfiler", EventLevel.Informational) | ||
| }; | ||
|
|
||
| SampleProfilerThreadTimeComputer computer = new(eventLog, symbolReader); | ||
| computer.GenerateThreadTimeStacks(stackSource); | ||
|
|
||
| Dictionary<int, List<StackSourceSample>> samplesForThread = new(); | ||
| // collect a *short* trace with stack samples | ||
| // the hidden '--duration' flag can increase the time of this trace in case 10ms | ||
| // is too short in a given environment, e.g., resource constrained systems | ||
| // N.B. - This trace INCLUDES rundown. For sufficiently large applications, it may take non-trivial time to collect | ||
| // the symbol data in rundown. | ||
| EventPipeSession session; | ||
| try | ||
| { | ||
| session = await client.StartEventPipeSessionAsync(providers, requestRundown:true, token:ct).ConfigureAwait(false); | ||
| } | ||
| catch (Exception) | ||
| { | ||
| stdError.WriteLine(EventPipeErrorMessage); | ||
| return -1; | ||
| } | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. We are now overriding all exceptions and their stacktraces with a blanket exception message, is that better?
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I think it's an improvement over current state. I don't think 'read past the end of the stream' gives us enough to tell what was wrong.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Does this particular API throw the EndOfStreamException? I wouldn't have expected it to. This API starts the session but the only thing it reads is the immediate reply from the runtime acknowledging the session start request. I think there are a couple potential issues here:
I think the ideal fix here is to determine the root cause of the EndOfStreamException assuming we can repro it. If that isn't possible then we probably want to do something that both helps users get a vague idea what went wrong while still including the exact Exception stack trace + Exception message to help us diagnose the bug when it occurs in the wild.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. Agree, we should hunt down what triggers the error and either make a more graceful handling of it and/or report the error better. Handling all exceptions the same way is probably to limiting and might hide valueable details if we hit cases when don't have good error handling for. dotnet-stack runs sample profiler for 10 ms and then run logic to analyze the samples, so maybe there is errors in its logic in case no data is coming back from sample profiler that could be detected differently. It also waits 5 seconds for rundown to complete, not sure how that works when the runtime is suspended. |
||
|
|
||
| stackSource.ForEach((sample) => { | ||
| StackSourceCallStackIndex stackIndex = sample.StackIndex; | ||
| while (!stackSource.GetFrameName(stackSource.GetFrameIndex(stackIndex), false).StartsWith("Thread (")) | ||
| try | ||
|
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. I'm not quite following, what was the rationale for adding try/catch for these three blocks (this plus the two above). Also couldn't this and the above be merged under the same try/catch, they seem like the same format
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. There is broad catch handler here that serves as our fallback for unanticipated errors. Inside that scope we generally only catch narrower Exception types that we know how to handle or how to describe to the user in a meaningful way.
Member
There was a problem hiding this comment. Choose a reason for hiding this commentThe reason will be displayed to describe this comment to others. Learn more. @copilot, please remove this try catch handler and let Exceptions flow into the outer catch handler. |
||
| { | ||
| using (session) | ||
| using (FileStream fs = File.OpenWrite(tempNetTraceFilename)) | ||
| { | ||
| stackIndex = stackSource.GetCallerIndex(stackIndex); | ||
| Task copyTask = session.EventStream.CopyToAsync(fs, ct); | ||
| await Task.Delay(duration, ct).ConfigureAwait(false); | ||
| await session.StopAsync(ct).ConfigureAwait(false); | ||
|
|
||
| // check if rundown is taking more than 5 seconds and add comment to report | ||
| Task timeoutTask = Task.Delay(TimeSpan.FromSeconds(5)); | ||
| Task completedTask = await Task.WhenAny(copyTask, timeoutTask).ConfigureAwait(false); | ||
| if (completedTask == timeoutTask) | ||
| { | ||
| stdOutput.WriteLine($"# Sufficiently large applications can cause this reportCommand to take non-trivial amounts of time"); | ||
| } | ||
| await copyTask.ConfigureAwait(false); | ||
| } | ||
| } | ||
| catch (Exception) | ||
| { | ||
| stdError.WriteLine(EventPipeErrorMessage); | ||
| return -1; | ||
| } | ||
|
|
||
| // long form for: int.Parse(threadFrame["Thread (".Length..^1)]) | ||
| // Thread id is in the frame name as "Thread (<ID>)" | ||
| string template = "Thread ("; | ||
| string threadFrame = stackSource.GetFrameName(stackSource.GetFrameIndex(stackIndex), false); | ||
| // using the generated trace file, symbolicate and compute stacks. | ||
| tempEtlxFilename = TraceLog.CreateFromEventPipeDataFile(tempNetTraceFilename); | ||
| using (SymbolReader symbolReader = new(TextWriter.Null) { SymbolPath = SymbolPath.MicrosoftSymbolServerPath }) | ||
| using (TraceLog eventLog = new(tempEtlxFilename)) | ||
| { | ||
| MutableTraceEventStackSource stackSource = new(eventLog) | ||
| { | ||
| OnlyManagedCodeStacks = true | ||
| }; | ||
|
|
||
| // we are looking for the first index of ) because | ||
| // we need to handle a thread name like: Thread (4008) (.NET IO ThreadPool Worker) | ||
| int firstIndex = threadFrame.IndexOf(')'); | ||
| int threadId = int.Parse(threadFrame.AsSpan(template.Length, firstIndex - template.Length)); | ||
| SampleProfilerThreadTimeComputer computer = new(eventLog, symbolReader); | ||
| computer.GenerateThreadTimeStacks(stackSource); | ||
|
|
||
| if (samplesForThread.TryGetValue(threadId, out List<StackSourceSample> samples)) | ||
| { | ||
| samples.Add(sample); | ||
| } | ||
| else | ||
| { | ||
| samplesForThread[threadId] = new List<StackSourceSample>() { sample }; | ||
| } | ||
| }); | ||
| Dictionary<int, List<StackSourceSample>> samplesForThread = new(); | ||
|
|
||
| // For every thread recorded in our trace, print the first stack | ||
| foreach ((int threadId, List<StackSourceSample> samples) in samplesForThread) | ||
| { | ||
| stackSource.ForEach((sample) => { | ||
| StackSourceCallStackIndex stackIndex = sample.StackIndex; | ||
| while (!stackSource.GetFrameName(stackSource.GetFrameIndex(stackIndex), false).StartsWith("Thread (")) | ||
| { | ||
| stackIndex = stackSource.GetCallerIndex(stackIndex); | ||
| } | ||
|
|
||
| // long form for: int.Parse(threadFrame["Thread (".Length..^1)]) | ||
| // Thread id is in the frame name as "Thread (<ID>)" | ||
| string template = "Thread ("; | ||
| string threadFrame = stackSource.GetFrameName(stackSource.GetFrameIndex(stackIndex), false); | ||
|
|
||
| // we are looking for the first index of ) because | ||
| // we need to handle a thread name like: Thread (4008) (.NET IO ThreadPool Worker) | ||
| int firstIndex = threadFrame.IndexOf(')'); | ||
| int threadId = int.Parse(threadFrame.AsSpan(template.Length, firstIndex - template.Length)); | ||
|
|
||
| if (samplesForThread.TryGetValue(threadId, out List<StackSourceSample> samples)) | ||
| { | ||
| samples.Add(sample); | ||
| } | ||
| else | ||
| { | ||
| samplesForThread[threadId] = new List<StackSourceSample>() { sample }; | ||
| } | ||
| }); | ||
|
|
||
| // For every thread recorded in our trace, print the first stack | ||
| foreach ((int threadId, List<StackSourceSample> samples) in samplesForThread) | ||
| { | ||
| #if DEBUG | ||
| stdOutput.WriteLine($"Found {samples.Count} stacks for thread 0x{threadId:X}"); | ||
| stdOutput.WriteLine($"Found {samples.Count} stacks for thread 0x{threadId:X}"); | ||
| #endif | ||
| PrintStack(stdOutput, threadId, samples[0], stackSource); | ||
| PrintStack(stdOutput, threadId, samples[0], stackSource); | ||
| } | ||
| } | ||
| } | ||
| } | ||
|
|
@@ -189,15 +245,17 @@ public static Command ReportCommand() | |
| { | ||
| ProcessIdOption, | ||
| NameOption, | ||
| DurationOption | ||
| DurationOption, | ||
| DiagnosticPortOption | ||
| }; | ||
|
|
||
| reportCommand.SetAction((parseResult, ct) => Report(ct, | ||
| stdOutput: parseResult.Configuration.Output, | ||
| stdError: parseResult.Configuration.Error, | ||
| processId: parseResult.GetValue(ProcessIdOption), | ||
| name: parseResult.GetValue(NameOption), | ||
| duration: parseResult.GetValue(DurationOption))); | ||
| duration: parseResult.GetValue(DurationOption), | ||
| diagnosticPort: parseResult.GetValue(DiagnosticPortOption))); | ||
|
|
||
| return reportCommand; | ||
| } | ||
|
|
@@ -221,5 +279,14 @@ public static Command ReportCommand() | |
| { | ||
| Description = "The name of the process to report the stack." | ||
| }; | ||
|
|
||
| private static readonly Option<string> DiagnosticPortOption = | ||
| new("--diagnostic-port", "--dport") | ||
| { | ||
| Description = "The path to a diagnostic port to be used." | ||
| }; | ||
|
|
||
| private const string EventPipeErrorMessage = | ||
| "There was a failure in reading stack data. Possible reasons could be trying to connect to a runtime that has been suspended, an unexpected close of the IPC channel, etc."; | ||
| } | ||
| } | ||
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Its a little confusing that if it we deemed it safe to call ResumeRuntimeAsync, that we would expect exceptions. If the runtime was suspended and we hit an exception, we are now ignoring it, is that right?
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Agreed, we probably want to follow the precedent established by other tools:
dotnet-trace
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
@copilot - please follow the Exception handling pattern used by dotnet-trace for this call as shown here: https://github.com/dotnet/diagnostics/blob/main/src/Tools/dotnet-trace/CommandLine/Commands/CollectCommand.cs#L341-L348
There was a problem hiding this comment.
Choose a reason for hiding this comment
The reason will be displayed to describe this comment to others. Learn more.
Done! Updated the exception handling to follow the dotnet-trace pattern. Now only catching
UnsupportedCommandExceptionfor the ResumeRuntimeAsync call, and removed the overly broad try-catch blocks that were hiding diagnostic information. Exceptions now bubble up with full stack traces. Commit f3a8c2d.