When I use the Splunk API (from node.js) to query a given sid, I only get back 1000 results, even when supplying the count=0 argument. This particular sid happens to return a great many records -- over 6 million. When I try an sid that returns far fewer records, say 5,000, they are all returned.
Here is my code:
var Request = require('request'); // 2.34.x

// Request settings for fetching the results of one Splunk search job.
// NOTE(review): the scheme is spelled 'htttps' exactly as posted (presumably to
// stop the forum from auto-linking it) -- use 'https' when actually running this.
var options = {
    // count=0 asks for every row instead of a single page.
    url: 'htttps://splunksvr:8089/servicesNS/hector/search/search/jobs/[sid]/results?output_mode=json&count=0',
    method: 'GET',
    auth: {
        user: 'hector',
        pass: 'wouldntyouliketoknow'
    },
    // NOTE(review): presumably turns off TLS certificate verification; only
    // acceptable against a trusted server with a self-signed certificate.
    rejectUnauthorized: false,
    requestCert: true,
    agent: false
};

// Fetch the results and report how many rows came back.
Request(options, function (err, response, body) {
    if (err) {
        // Without this early return, JSON.parse below would throw on the
        // undefined body and mask the real transport error.
        console.log('Error calling Splunk: ' + err);
        return;
    }

    var parsed;
    try {
        parsed = JSON.parse(body);
    } catch (parseErr) {
        // A non-JSON body (e.g. an error page) must not crash the process.
        console.log('Error parsing Splunk response: ' + parseErr);
        return;
    }

    if (parsed && parsed.results) {
        console.log('query result count: ' + parsed.results.length);
    }
});
Output:
query result count: 1000
I've also tried using pagination like this:
var Request = require('request');
var internals = {};

// Number of rows requested per page. Must stay in sync with the count=
// parameter embedded in options.url below.
internals.pageSize = 300;

// Request settings template; '{offset}' in the URL is substituted per page.
// NOTE(review): the scheme is spelled 'htttps' exactly as posted (presumably to
// stop the forum from auto-linking it) -- use 'https' when actually running this.
var options = {
    url: 'htttps://splunksvr:8089/servicesNS/hector/search/search/jobs/[sid]/results?output_mode=json&count=300&offset={offset}',
    method: 'GET',
    auth: {
        user: 'hector',
        pass: 'wouldntyouliketoknow'
    },
    // NOTE(review): presumably turns off TLS certificate verification; only
    // acceptable against a trusted server with a self-signed certificate.
    rejectUnauthorized: false,
    requestCert: true,
    agent: false
};

/**
 * Fetches one page of job results starting at `offset`, logs the page size,
 * and recurses to the next page until a page comes back empty.
 *
 * @param {Object} options - Request settings whose `url` contains the
 *     '{offset}' placeholder. The object is not modified.
 * @param {number} offset - Zero-based index of the first row to fetch.
 * @param {Function} callback - Called with no arguments once an empty page is
 *     received, or with an Error if the request or JSON parsing fails.
 */
internals.querySplunk = function (options, offset, callback) {
    // Work on a shallow copy so the caller's template (and its '{offset}'
    // placeholder) is left untouched between pages.
    var requestOptions = {};
    Object.keys(options).forEach(function (key) {
        requestOptions[key] = options[key];
    });
    requestOptions.url = options.url.replace('{offset}', offset);
    console.log(requestOptions.url);

    Request(requestOptions, function (err, response, body) {
        if (err) {
            // Stop here: parsing an undefined body would throw and the
            // caller would never hear back.
            console.log('Error calling Splunk: ' + err);
            return callback(err);
        }

        var parsed;
        try {
            parsed = JSON.parse(body);
        } catch (parseErr) {
            console.log('Error parsing Splunk response: ' + parseErr);
            return callback(parseErr);
        }

        // Treat a response without a results array as an empty page.
        var resultCount = (parsed && parsed.results) ? parsed.results.length : 0;
        console.log('query result count: ' + resultCount);

        if (resultCount === 0) {
            console.log('Done getting results.');
            return callback();
        }

        internals.querySplunk(options, offset + internals.pageSize, callback);
    });
};

// Page through the job's results from the beginning.
internals.querySplunk(options, 0, function () {
    console.log('Exiting.');
});
Output:
htttps://splunksvr:8089/servicesNS/hector/search/search/jobs/[sid]/results?output_mode=json&count=300&offset=0
query result count: 300
htttps://splunksvr:8089/servicesNS/hector/search/search/jobs/[sid]/results?output_mode=json&count=300&offset=300
query result count: 300
htttps://splunksvr:8089/servicesNS/hector/search/search/jobs/[sid]/results?output_mode=json&count=300&offset=600
query result count: 300
htttps://splunksvr:8089/servicesNS/hector/search/search/jobs/[sid]/results?output_mode=json&count=300&offset=900
query result count: 100
htttps://splunksvr:8089/servicesNS/hector/search/search/jobs/[sid]/results?output_mode=json&count=300&offset=1200
query result count: 0
Done getting results.
Exiting.
As you can see, it still only pages to 1,000 results. What gives?
If your search is non-transforming (i.e., it returns events but not results, because it doesn't use commands like stats or timechart to perform aggregation), you don't want to hit the /services/search/jobs/{SID}/results endpoint:
Returns the results of the search specified by {search_id}. This is the table that exists after all processing from the search pipeline has completed. This is the primary method for a client to fetch a set of TRANSFORMED events. If the dispatched search does not include a transforming command, the effect is the same as get_events, however with fewer options.
...but rather the /services/search/jobs/{SID}/events endpoint:
Returns the events of the search specified by {search_id}. These events are the data from the search pipeline before the first "transforming" search command. This is the primary method for a client to fetch a set of UNTRANSFORMED events for the search job. This endpoint is only valid if the status_buckets > 0 or the search has no transforming commands.
Also, if your goal is to perform a massive event export, the best method is the /services/search/export endpoint.
If your search is non-transforming (i.e., it returns events but not results, because it doesn't use commands like stats or timechart to perform aggregation), you don't want to hit the /services/search/jobs/{SID}/results endpoint:
Returns the results of the search specified by {search_id}. This is the table that exists after all processing from the search pipeline has completed. This is the primary method for a client to fetch a set of TRANSFORMED events. If the dispatched search does not include a transforming command, the effect is the same as get_events, however with fewer options.
...but rather the /services/search/jobs/{SID}/events endpoint:
Returns the events of the search specified by {search_id}. These events are the data from the search pipeline before the first "transforming" search command. This is the primary method for a client to fetch a set of UNTRANSFORMED events for the search job. This endpoint is only valid if the status_buckets > 0 or the search has no transforming commands.
Also, if your goal is to perform a massive event export, the best method is the /services/search/export endpoint.
hexx: Thanks for your very succinct and insightful answer. It looks like the export endpoint is best for my needs.
I explained my question poorly - I apologize.
The details you gave above only talk about how you are requesting results from a specific search, not about how that search was created, which is what we need to know.
I am running a search in the Splunk client and then referencing its sid via the REST API query parameters.
As you can see from the code above, the API query (with parameters) is:
https://splunksvr:8089/servicesNS/hector/search/search/jobs/[sid]/results?output_mode=json&count=0
What search are you running? How are you running it? If it is from the REST API, what parameters are you sending to the REST API?