aboutsummaryrefslogtreecommitdiff
diff options
context:
space:
mode:
-rw-r--r--Sema/Sema.h8
-rw-r--r--Sema/SemaChecking.cpp292
-rw-r--r--Sema/SemaExpr.cpp2
-rw-r--r--include/clang/Basic/DiagnosticKinds.def20
-rw-r--r--test/Sema/format-strings.c41
5 files changed, 329 insertions, 34 deletions
diff --git a/Sema/Sema.h b/Sema/Sema.h
index d373cc0400..22fffc7299 100644
--- a/Sema/Sema.h
+++ b/Sema/Sema.h
@@ -423,10 +423,14 @@ private:
// Extra semantic analysis beyond the C type system
private:
- void CheckFunctionCall(Expr *Fn, FunctionDecl *FDecl,
+ void CheckFunctionCall(Expr *Fn,
+ SourceLocation LParenLoc, SourceLocation RParenLoc,
+ FunctionDecl *FDecl,
Expr** Args, unsigned NumArgsInCall);
- void CheckPrintfArguments(Expr *Fn, unsigned id_idx, FunctionDecl *FDecl,
+ void CheckPrintfArguments(Expr *Fn,
+ SourceLocation LParenLoc, SourceLocation RParenLoc,
+ bool HasVAListArg, FunctionDecl *FDecl,
unsigned format_idx, Expr** Args,
unsigned NumArgsInCall);
};
diff --git a/Sema/SemaChecking.cpp b/Sema/SemaChecking.cpp
index 8cc3c6c147..511f56f2a8 100644
--- a/Sema/SemaChecking.cpp
+++ b/Sema/SemaChecking.cpp
@@ -29,7 +29,9 @@ using namespace clang;
/// CheckFunctionCall - Check a direct function call for various correctness
/// and safety properties not strictly enforced by the C type system.
void
-Sema::CheckFunctionCall(Expr *Fn, FunctionDecl *FDecl,
+Sema::CheckFunctionCall(Expr *Fn,
+ SourceLocation LParenLoc, SourceLocation RParenLoc,
+ FunctionDecl *FDecl,
Expr** Args, unsigned NumArgsInCall) {
// Get the IdentifierInfo* for the called function.
@@ -37,55 +39,287 @@ Sema::CheckFunctionCall(Expr *Fn, FunctionDecl *FDecl,
// Search the KnownFunctionIDs for the identifier.
unsigned i = 0, e = id_num_known_functions;
- for ( ; i != e; ++i) { if (KnownFunctionIDs[i] == FnInfo) break; }
- if( i == e ) return;
+ for (; i != e; ++i) { if (KnownFunctionIDs[i] == FnInfo) break; }
+ if (i == e) return;
// Printf checking.
if (i <= id_vprintf) {
- // Retrieve the index of the format string parameter.
+ // Retrieve the index of the format string parameter and determine
+ // if the function is passed a va_arg argument.
unsigned format_idx = 0;
+ bool HasVAListArg = false;
+
switch (i) {
default: assert(false && "No format string argument index.");
case id_printf: format_idx = 0; break;
case id_fprintf: format_idx = 1; break;
case id_sprintf: format_idx = 1; break;
case id_snprintf: format_idx = 2; break;
- case id_vsnprintf: format_idx = 2; break;
- case id_asprintf: format_idx = 1; break;
- case id_vasprintf: format_idx = 1; break;
- case id_vfprintf: format_idx = 1; break;
- case id_vsprintf: format_idx = 1; break;
- case id_vprintf: format_idx = 1; break;
- }
- CheckPrintfArguments(Fn, i, FDecl, format_idx, Args, NumArgsInCall);
+ case id_asprintf: format_idx = 1; HasVAListArg = true; break;
+ case id_vsnprintf: format_idx = 2; HasVAListArg = true; break;
+ case id_vasprintf: format_idx = 1; HasVAListArg = true; break;
+ case id_vfprintf: format_idx = 1; HasVAListArg = true; break;
+ case id_vsprintf: format_idx = 1; HasVAListArg = true; break;
+ case id_vprintf: format_idx = 0; HasVAListArg = true; break;
+ }
+
+ CheckPrintfArguments(Fn, LParenLoc, RParenLoc, HasVAListArg,
+ FDecl, format_idx, Args, NumArgsInCall);
}
}
/// CheckPrintfArguments - Check calls to printf (and similar functions) for
-/// correct use of format strings. Improper format strings to functions in
-/// the printf family can be the source of bizarre bugs and very serious
-/// security holes. A good source of information is available in the following
-/// paper (which includes additional references):
+/// correct use of format strings.
+///
+/// HasVAListArg - A predicate indicating whether the printf-like
+/// function is passed an explicit va_arg argument (e.g., vprintf)
+///
+/// format_idx - The index into Args for the format string.
+///
+/// Improper format strings to functions in the printf family can be
+/// the source of bizarre bugs and very serious security holes. A
+/// good source of information is available in the following paper
+/// (which includes additional references):
///
/// FormatGuard: Automatic Protection From printf Format String
/// Vulnerabilities, Proceedings of the 10th USENIX Security Symposium, 2001.
+///
+/// Functionality implemented:
+///
+/// We can statically check the following properties for string
+/// literal format strings for non v.*printf functions (where the
+/// arguments are passed directly):
+//
+/// (1) Are the number of format conversions equal to the number of
+/// data arguments?
+///
+/// (2) Does each format conversion correctly match the type of the
+/// corresponding data argument? (TODO)
+///
+/// Moreover, for all printf functions we can:
+///
+/// (3) Check for a missing format string (when not caught by type checking).
+///
+/// (4) Check for no-operation flags; e.g. using "#" with format
+/// conversion 'c' (TODO)
+///
+/// (5) Check the use of '%n', a major source of security holes.
+///
+/// (6) Check for malformed format conversions that don't specify anything.
+///
+/// (7) Check for empty format strings. e.g: printf("");
+///
+/// (8) Check that the format string is a wide literal.
+///
+/// All of these checks can be done by parsing the format string.
+///
+/// For now, we ONLY do (1), (3), (5), (6), (7), and (8).
void
-Sema::CheckPrintfArguments(Expr *Fn, unsigned id_idx, FunctionDecl *FDecl,
+Sema::CheckPrintfArguments(Expr *Fn,
+ SourceLocation LParenLoc, SourceLocation RParenLoc,
+ bool HasVAListArg, FunctionDecl *FDecl,
unsigned format_idx, Expr** Args,
unsigned NumArgsInCall) {
-
- assert( format_idx < NumArgsInCall );
-
+ // CHECK: printf-like function is called with no format string.
+ if (format_idx >= NumArgsInCall) {
+ Diag(RParenLoc, diag::warn_printf_missing_format_string,
+ Fn->getSourceRange());
+ return;
+ }
+
// CHECK: format string is not a string literal.
//
- // Dynamically generated format strings are difficult to automatically
- // vet at compile time. Requiring that format strings are string literals
- // (1) permits the checking of format strings by the compiler and thereby
- // (2) can practically remove the source of many format string exploits.
-
+ // Dynamically generated format strings are difficult to
+ // automatically vet at compile time. Requiring that format strings
+ // are string literals: (1) permits the checking of format strings by
+ // the compiler and thereby (2) can practically remove the source of
+ // many format string exploits.
StringLiteral *FExpr = dyn_cast<StringLiteral>(Args[format_idx]);
- if ( FExpr == NULL )
- Diag( Args[format_idx]->getLocStart(),
- diag::warn_printf_not_string_constant, Fn->getSourceRange() );
-} \ No newline at end of file
+ if (FExpr == NULL) {
+ Diag(Args[format_idx]->getLocStart(),
+ diag::warn_printf_not_string_constant, Fn->getSourceRange());
+ return;
+ }
+
+ // CHECK: is the format string a wide literal?
+ if (FExpr->isWide()) {
+ Diag(Args[format_idx]->getLocStart(),
+ diag::warn_printf_format_string_is_wide_literal,
+ Fn->getSourceRange());
+ return;
+ }
+
+ // Str - The format string. NOTE: this is NOT null-terminated!
+ const char * const Str = FExpr->getStrData();
+
+ // CHECK: empty format string?
+ const unsigned StrLen = FExpr->getByteLength();
+
+ if (StrLen == 0) {
+ Diag(Args[format_idx]->getLocStart(),
+ diag::warn_printf_empty_format_string, Fn->getSourceRange());
+ return;
+ }
+
+ // We process the format string using a binary state machine. The
+ // current state is stored in CurrentState.
+ enum {
+ state_OrdChr,
+ state_Conversion
+ } CurrentState = state_OrdChr;
+
+ // numConversions - The number of conversions seen so far. This is
+ // incremented as we traverse the format string.
+ unsigned numConversions = 0;
+
+ // numDataArgs - The number of data arguments after the format
+ // string. This can only be determined for non vprintf-like
+ // functions. For those functions, this value is 1 (the sole
+ // va_arg argument).
+ unsigned numDataArgs = NumArgsInCall-(format_idx+1);
+
+ // Inspect the format string.
+ unsigned StrIdx = 0;
+
+ // LastConversionIdx - Index within the format string where we last saw
+ // a '%' character that starts a new format conversion.
+ unsigned LastConversionIdx = 0;
+
+ for ( ; StrIdx < StrLen ; ++StrIdx ) {
+
+ // Is the number of detected conversion conversions greater than
+ // the number of matching data arguments? If so, stop.
+ if (!HasVAListArg && numConversions > numDataArgs) break;
+
+ // Handle "\0"
+ if(Str[StrIdx] == '\0' ) {
+ // The string returned by getStrData() is not null-terminated,
+ // so the presence of a null character is likely an error.
+
+ SourceLocation Loc =
+ PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),StrIdx+1);
+
+ Diag(Loc, diag::warn_printf_format_string_contains_null_char,
+ Fn->getSourceRange());
+
+ return;
+ }
+
+ // Ordinary characters (not processing a format conversion).
+ if (CurrentState == state_OrdChr) {
+ if (Str[StrIdx] == '%') {
+ CurrentState = state_Conversion;
+ LastConversionIdx = StrIdx;
+ }
+ continue;
+ }
+
+ // Seen '%'. Now processing a format conversion.
+ switch (Str[StrIdx]) {
+ // Characters which can terminate a format conversion
+ // (e.g. "%d"). Characters that specify length modifiers or
+ // other flags are handled by the default case below.
+ //
+ // TODO: additional checks will go into the following cases.
+ case 'i':
+ case 'd':
+ case 'o':
+ case 'u':
+ case 'x':
+ case 'X':
+ case 'D':
+ case 'O':
+ case 'U':
+ case 'e':
+ case 'E':
+ case 'f':
+ case 'F':
+ case 'g':
+ case 'G':
+ case 'a':
+ case 'A':
+ case 'c':
+ case 'C':
+ case 'S':
+ case 's':
+ case 'P':
+ ++numConversions;
+ CurrentState = state_OrdChr;
+ break;
+
+ // CHECK: Are we using "%n"? Issue a warning.
+ case 'n': {
+ ++numConversions;
+ CurrentState = state_OrdChr;
+ SourceLocation Loc =
+ PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),
+ LastConversionIdx+1);
+
+ Diag(Loc, diag::warn_printf_write_back, Fn->getSourceRange());
+ break;
+ }
+
+ // Handle "%%"
+ case '%':
+ // Sanity check: Was the first "%" character the previous one?
+ // If not, we will assume that we have a malformed format
+ // conversion, and that the current "%" character is the start
+ // of a new conversion.
+ if (StrIdx - LastConversionIdx == 1)
+ CurrentState = state_OrdChr;
+ else {
+ // Issue a warning: invalid format conversion.
+ SourceLocation Loc =
+ PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),
+ LastConversionIdx+1);
+
+ Diag(Loc, diag::warn_printf_invalid_conversion,
+ std::string(Str+LastConversionIdx, Str+StrIdx),
+ Fn->getSourceRange());
+
+ // This conversion is broken. Advance to the next format
+ // conversion.
+ LastConversionIdx = StrIdx;
+ ++numConversions;
+ }
+
+ break;
+
+ default:
+ // This case catches all other characters: flags, widths, etc.
+ // We should eventually process those as well.
+ break;
+ }
+ }
+
+ if (CurrentState == state_Conversion) {
+ // Issue a warning: invalid format conversion.
+ SourceLocation Loc =
+ PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),
+ LastConversionIdx+1);
+
+ Diag(Loc, diag::warn_printf_invalid_conversion,
+ std::string(Str+LastConversionIdx, Str+StrIdx),
+ Fn->getSourceRange());
+ return;
+ }
+
+ if (!HasVAListArg) {
+ // CHECK: Does the number of format conversions exceed the number
+ // of data arguments?
+ if (numConversions > numDataArgs) {
+ SourceLocation Loc =
+ PP.AdvanceToTokenCharacter(Args[format_idx]->getLocStart(),
+ LastConversionIdx);
+
+ Diag(Loc, diag::warn_printf_insufficient_data_args,
+ Fn->getSourceRange());
+ }
+ // CHECK: Does the number of data arguments exceed the number of
+ // format conversions in the format string?
+ else if (numConversions < numDataArgs)
+ Diag(Args[format_idx+numConversions+1]->getLocStart(),
+ diag::warn_printf_too_many_data_args, Fn->getSourceRange());
+ }
+}
diff --git a/Sema/SemaExpr.cpp b/Sema/SemaExpr.cpp
index 6b871b4812..f69867fdf1 100644
--- a/Sema/SemaExpr.cpp
+++ b/Sema/SemaExpr.cpp
@@ -561,7 +561,7 @@ ParseCallExpr(ExprTy *fn, SourceLocation LParenLoc,
if (ImplicitCastExpr *IcExpr = dyn_cast<ImplicitCastExpr>(Fn))
if (DeclRefExpr *DRExpr = dyn_cast<DeclRefExpr>(IcExpr->getSubExpr()))
if (FunctionDecl *FDecl = dyn_cast<FunctionDecl>(DRExpr->getDecl()))
- CheckFunctionCall(Fn, FDecl, Args, NumArgsInCall);
+ CheckFunctionCall(Fn, LParenLoc, RParenLoc, FDecl, Args, NumArgsInCall);
return new CallExpr(Fn, Args, NumArgsInCall, resultType, RParenLoc);
}
diff --git a/include/clang/Basic/DiagnosticKinds.def b/include/clang/Basic/DiagnosticKinds.def
index 041f5537dd..9137a8419f 100644
--- a/include/clang/Basic/DiagnosticKinds.def
+++ b/include/clang/Basic/DiagnosticKinds.def
@@ -661,8 +661,24 @@ DIAG(warn_unused_expr, WARNING,
// Printf checking
DIAG(warn_printf_not_string_constant, WARNING,
- "format string is not a string literal")
-
+ "format string is not a string literal (potentially insecure)")
+DIAG(warn_printf_write_back, WARNING,
+ "use of '%n' in format string discouraged (potentially insecure)")
+DIAG(warn_printf_insufficient_data_args, WARNING,
+ "more '%' conversions than data arguments")
+DIAG(warn_printf_too_many_data_args, WARNING,
+ "more data arguments than '%' conversions")
+DIAG(warn_printf_invalid_conversion, WARNING,
+ "invalid conversion '%0'")
+DIAG(warn_printf_missing_format_string, WARNING,
+ "format string missing")
+DIAG(warn_printf_empty_format_string, WARNING,
+ "format string is empty")
+DIAG(warn_printf_format_string_is_wide_literal, WARNING,
+ "format string should not be a wide string")
+DIAG(warn_printf_format_string_contains_null_char, WARNING,
+ "format string contains '\\0' within the string body")
+
// Statements.
DIAG(err_continue_not_in_loop, ERROR,
"'continue' statement not in loop statement")
diff --git a/test/Sema/format-strings.c b/test/Sema/format-strings.c
index f71cd58645..403da07b0f 100644
--- a/test/Sema/format-strings.c
+++ b/test/Sema/format-strings.c
@@ -21,3 +21,44 @@ void check_string_literal( FILE* fp, const char* s, char *buf, ... ) {
vsnprintf(buf,2,s,ap); // expected-warning {{mat string is not a string lit}}
}
+void check_writeback_specifier()
+{
+ int x;
+ char *b;
+
+ printf("%n",&x); // expected-warning {{'%n' in format string discouraged}}
+ sprintf(b,"%d%%%n",1, &x); // expected-warning {{'%n' in format string dis}}
+}
+
+void check_invalid_specifier(FILE* fp, char *buf)
+{
+ printf("%s%lb%d","unix",10,20); // expected-warning {{lid conversion '%lb'}}
+ fprintf(fp,"%%%l"); // expected-warning {{lid conversion '%l'}}
+ sprintf(buf,"%%%%%ld%d%d", 1, 2, 3); // no-warning
+ snprintf(buf, 2, "%%%%%ld%;%d", 1, 2, 3); // expected-warning {{sion '%;'}}
+}
+
+void check_null_char_string(char* b)
+{
+ printf("\0this is bogus%d",1); // expected-warning {{string contains '\0'}}
+ snprintf(b,10,"%%%%%d\0%d",1,2); // expected-warning {{string contains '\0'}}
+ printf("%\0d",1); // expected-warning {{string contains '\0'}}
+}
+
+void check_empty_format_string(char* buf)
+{
+ va_list ap;
+ va_start(ap,buf);
+ vprintf("",ap); // expected-warning {{format string is empty}}
+ sprintf(buf,""); // expected-warning {{format string is empty}}
+}
+
+void check_wide_string()
+{
+ char *b;
+ va_list ap;
+ va_start(ap,b);
+
+ printf(L"foo %d",2); // expected-warning {{should not be a wide string}}
+ vasprintf(&b,L"bar %d",2); // expected-warning {{should not be a wide string}}
+}