Ticket #3679: imdb.pl

File imdb.pl, 19.0 KB (added by anonymous, 18 years ago)

imdb.pl fix

Line 
#!/usr/bin/perl -w

#
# This perl script is intended to perform movie data lookups based on
# the popular www.imdb.com website
#
# For more information on MythVideo's external movie lookup mechanism, see
# the README file in this directory.
#
# Author: Tim Harvey (tharvey AT alumni.calpoly DOT edu)
# Modified: Andrei Rjeousski
# v1.1
# - Added amazon.com covers and improved handling for imdb posters
# v1.2
# - when searching amazon, try searching for main movie name and if nothing
#   is found, search for informal name
# - better handling for amazon posters, see if movie title is a substring
#   in the search results returned by amazon
# - fixed redirects for some movies on impawards
# v1.3
# - fixed search for low res images (imdb changed the page layout)
# - added cinemablend poster search
# - added nexbase poster search
# - removed amazon.com searching for now

# changes:
# 9-10-2006: Anduin Withers
#   Changed output to utf8
# 30-6-2007: fix for new imdb.com design
30
31use LWP::Simple; # libwww-perl providing simple HTML get actions
32use HTML::Entities;
33use URI::Escape;
34
35
36use vars qw($opt_h $opt_r $opt_d $opt_i $opt_v $opt_D $opt_M $opt_P);
37use Getopt::Std;
38
# identification strings reported by version() and help()
($title, $version, $author) =
   ("IMDB Query", "v1.3", "Tim Harvey, Andrei Rjeousski");

# everything written to STDOUT goes through the :utf8 layer
binmode STDOUT, ":utf8";
44
# print the command line synopsis, one line per option, then terminate the
# program with exit status -1 (this sub never returns)
sub usage {
   my @synopsis = (
      "usage: $0 -hdrviMPD [parameters]",
      " -h help",
      " -d debug",
      " -r dump raw query result data only",
      " -v display version",
      " -i display info",
      "",
      " -M [options] <query> get movie list",
      " some known options are:",
      " type=[fuzy] looser search",
      " from_year=[int] limit matches to year",
      " to_year=[int] limit matches to year",
      " sort=[smart] ??",
      " tv=[no|both|only] limits between tv and movies",
      " Note: multiple options must be separated by ';'",
      " -P <movieid> get movie poster",
      " -D <movieid> get movie data",
   );
   foreach my $line (@synopsis) {
      print $line . "\n";
   }
   exit(-1);
}
66
# print a single line naming the program, its version and its author(s),
# taken from the file-level $title, $version and $author
sub version {
   printf("%s (%s) by %s\n", $title, $version, $author);
}
71
# print a single line describing the kind of query this script performs
sub info {
   my $blurb = "Performs queries using the www.imdb.com website.\n";
   print $blurb;
}
76
# print the detailed help: the version line, the info line and the usage
# text, in that order (usage() ends the program)
sub help {
   foreach my $step (\&version, \&info, \&usage) {
      $step->();
   }
}
83
# return a copy of the given string with leading and trailing whitespace
# removed; whitespace inside the string is left alone
sub trim {
   my ($str) = @_;
   for ($str) {
      s/^\s+//;   # leading
      s/\s+$//;   # trailing
   }
   return $str;
}
90
# returns the text within 'data' that lies between the first occurrence of
# 'beg' and the next occurrence of 'end' after it
#
# - both markers are matched case-insensitively
# - HTML entities in the returned text are decoded
#   (see http://www.w3.org/TR/html4/charset.html#h-5.3.1)
# - returns "" when 'data' is undefined (e.g. a failed download) or when
#   either marker cannot be found
sub parseBetween {
   my ($data, $beg, $end)=@_; # grab parameters

   # a failed get() hands us undef; report "nothing found" instead of
   # pushing undef through lc()/index() and collecting warnings under -w
   return "" unless defined $data;

   # search a lower-cased copy so the markers match regardless of case, but
   # cut the result out of the original text so its case is preserved
   # (assumes lc() does not change the length of the text)
   my $ldata = lc($data);

   my $beg_at = index($ldata, lc($beg));
   return "" if $beg_at == -1;

   my $start = $beg_at + length($beg);
   my $finish = index($ldata, lc($end), $start);
   return "" if $finish == -1;

   my $result = substr($data, $start, $finish - $start);
   decode_entities($result); # decodes in place
   return $result;
}
107
# get Movie Data
#
# Fetches the IMDB title page for 'movieid' (the digits that follow "tt" in
# an IMDB title url), scrapes it, and prints one "Field:value" line per field
# to STDOUT.  Fields that cannot be located are printed with an empty value.
# With -d the lookup is traced in '#' prefixed lines; with -r the raw page is
# echoed before the fields.
sub getMovieData {
   my ($movieid)=@_; # grab movieid parameter
   if (defined $opt_d) { printf("# looking for movie id: '%s'\n", $movieid);}

   # a link to a person page; captures the displayed name
   my $name_link_pat = qr'<a href="/name/[^"]*">([^<]*)</a>'m;

   # get the title page
   my $request = "http://www.imdb.com/title/tt" . $movieid . "/";
   if (defined $opt_d) { printf("# request: '%s'\n", $request); }
   my $response = get($request);
   # get() yields undef when the download fails; carry on with an empty page
   # so every field is simply reported empty instead of tripping
   # "uninitialized value" warnings in every step below
   if (!defined $response) { $response = ""; }
   if (defined $opt_r) { printf("%s", $response); }

   # parse title and year
   my $year = "";
   my $title = parseBetween($response, "<title>", "</title>");
   if ($title =~ m#(.+) \((\d+).*\)#) # Note some years have a /II after them?
   {
      $title = $1;
      $year = $2;
   }
   elsif ($title =~ m#(.+) \(\?\?\?\?\)#) # year not known yet
   {
      $title = $1;
   }

   # parse director (singular heading first, then plural)
   my $data = parseBetween($response, ">Director:</h5>", "</div>");
   if (!length($data)) {
      $data = parseBetween($response, ">Directors:</h5>", "</div>");
   }
   my $director = join(",", ($data =~ m/$name_link_pat/g));

   # parse writer
   # (every linked name in the first writer section found is included)
   $data = parseBetween($response, ">Writers <a href=\"/wga\">(WGA)</a>:</h5>", "</div>");
   if (!length($data)) {
      $data = parseBetween($response, ">Writer:</h5>", "</div>");
   }
   if (!length($data)) {
      $data = parseBetween($response, ">Writers:</h5>", "</div>");
   }
   my $writer = join(",", ($data =~ m/$name_link_pat/g));

   # parse plot
   my $plot = parseBetween($response, ">Plot Outline:</h5> ", "</div>");
   if (!$plot) {
      $plot = parseBetween($response, ">Plot Summary:</h5> ", "</div>");
   }

   if ($plot) {
      # replace name links in plot (example 0388795)
      $plot =~ s/$name_link_pat/$1/g;

      # replace title links
      my $title_link_pat = qr!<a href="/title/[^"]*">([^<]*)</a>!m;
      $plot =~ s/$title_link_pat/$1/g;

      # plot ends at first remaining link
      my $plot_end = index($plot, "<a ");
      if ($plot_end != -1) {
         $plot = substr($plot, 0, $plot_end);
      }
      $plot = trim($plot);
   }

   # parse user rating (the part of "<b>x/10</b>" before the slash)
   my $userrating = parseBetween($response, ">User Rating:</b>", "> (");
   $userrating = parseBetween($userrating, "<b>", "/");

   # parse MPAA rating; fall back to the certification listed for
   # $ratingcountry when there is no MPAA section
   my $ratingcountry = "USA";
   my $movierating = trim(parseBetween($response, ">MPAA</a>:</h5>", "</div>"));
   if (!$movierating) {
      $movierating = parseBetween($response, ">Certification:</h5>", "</div>");
      $movierating = parseBetween($movierating, "certificates=$ratingcountry",
                                  "/a>");
      $movierating = parseBetween($movierating, ">", "<");
   }

   # parse movie length; when the first value is not a number, use the
   # country-qualified "USA:" entry instead
   my $runtime = trim(parseBetween($response, ">Runtime:</h5>", " min"));
   unless ($runtime =~ /^-?\d/) {
      $runtime = trim(parseBetween($response, "USA:", " min"));
   }

   # parse cast
   # Note: full cast would be from url:
   # www.imdb.com/title/<movieid>/fullcredits
   my $cast = "";
   $data = parseBetween($response, "Cast overview, first billed only",
                        "/table>");
   if ($data) {
      $cast = join(',', ($data =~ m/$name_link_pat/g));
   }

   # parse genres
   my $lgenres = "";
   $data = parseBetween($response, "<h5>Genre:</h5>","</div>");
   if ($data) {
      my $genre_pat = qr'/Sections/Genres/(?:[a-z ]+/)*">([^<]+)<'im;
      $lgenres = join(',', ($data =~ /$genre_pat/g));
   }

   # parse countries
   $data = parseBetween($response, "Country:</h5>","</div>");
   my $country_pat = qr'/Sections/Countries/[A-Z]+/">([^<]+)</a>'i;
   my $lcountries = join(",", ($data =~ m/$country_pat/g));

   # output fields (these field names must match what MythVideo is looking for)
   # NOTE(review): the last four fields carry a space after the colon, unlike
   # the first seven; left untouched because the consumer is not visible here
   print "Title:$title\n";
   print "Year:$year\n";
   print "Director:$director\n";
   print "Plot:$plot\n";
   print "UserRating:$userrating\n";
   print "MovieRating:$movierating\n";
   print "Runtime:$runtime\n";
   print "Writers: $writer\n";
   print "Cast: $cast\n";
   print "Genres: $lgenres\n";
   print "Countries: $lcountries\n";
}
231
# dump Movie Poster
#
# Prints the url of a poster image for 'movieid' on STDOUT, or an empty line
# when none can be found.  Nothing at all is printed when the IMDB posters
# page itself cannot be downloaded.  Sources are tried in this order and the
# first hit wins:
#   1. impawards.com   (linked from the IMDB posters page)
#   2. nexbase         (linked from the IMDB posters page)
#   3. cinemablend     (linked from the IMDB posters page)
#   4. posters.imdb.com image referenced by the IMDB posters page
#   5. the low resolution poster embedded in the IMDB title page
# With -d the search is traced in '#' prefixed lines; with -r every page
# that is downloaded is echoed.
sub getMoviePoster {
   my ($movieid)=@_; # grab movieid parameter
   if (defined $opt_d) { printf("# looking for movie id: '%s'\n", $movieid);}

   # get the posters page
   my $request = "http://www.imdb.com/title/tt" . $movieid . "/posters";
   if (defined $opt_d) { printf("# request: '%s'\n", $request); }
   my $response = get($request);

   # check before dumping: there is nothing to echo or scrape on failure
   if (!defined $response) {return;}
   if (defined $opt_r) { printf("%s", $response); }

   my $uri = "";

   # look for references to impawards.com posters - they are high quality
   my $site = "http://www.impawards.com";
   my $impsite = parseBetween($response, "<a href=\"".$site, "\">".$site);

   # jersey girl fix (link written without the "www." prefix)
   $impsite = parseBetween($response, "<a href=\"http://impawards.com","\">http://impawards.com") if ($impsite eq "");

   if ($impsite) {
      $impsite = $site . $impsite;

      if (defined $opt_d) { print "# Searching for poster at: ".$impsite."\n"; }
      my $impres = get($impsite);
      if (!defined $impres) { $impres = ""; } # failed download: nothing to parse
      if (defined $opt_d) { printf("# got %i bytes\n", length($impres)); }
      if (defined $opt_r) { printf("%s", $impres); }

      # making sure it isnt redirect
      my $redirect = parseBetween($impres, "0;URL=..", "\">");
      if ($redirect ne "") {
         if (defined $opt_d) { printf("# processing redirect to %s\n",$redirect); }
         # this was redirect
         $impsite = $site . $redirect;
         $impres = get($impsite);
         if (!defined $impres) { $impres = ""; }
      }

      # do stuff normally
      my $poster = parseBetween($impres, "<img SRC=\"posters/", "\" ALT");
      # Only accept the result when an image was actually found.  Patching up
      # an empty name would yield the bare ".../posters/" directory url and
      # stop the remaining sources from ever being tried.
      if ($poster ne "") {
         # uri here is relative... patch it up to make a valid uri
         if (!($poster =~ /http:(.*)/ )) {
            my $path = substr($impsite, 0, rindex($impsite, '/') + 1);
            $poster = $path."posters/".$poster;
         }
         $uri = $poster;
         if (defined $opt_d) { print "# found ipmawards poster: $uri\n"; }
      }
   }

   # try looking on nexbase
   if ($uri eq "" && $response =~ m/<a href="([^"]*)">([^"]*?)nexbase/i) {
      my $page_url = $1; # copy before further matches overwrite the capture
      if ($page_url ne "") {
         if (defined $opt_d) { print "# found nexbase poster page: $page_url \n"; }
         my $cinres = get($page_url);
         if (!defined $cinres) { $cinres = ""; }
         if (defined $opt_d) { printf("# got %i bytes\n", length($cinres)); }
         if (defined $opt_r) { printf("%s", $cinres); }

         if ($cinres =~ m/<a id="photo_url" href="([^"]*?)" ><\/a>/i) {
            if (defined $opt_d) { print "# nexbase url retreived\n"; }
            $uri = $1;
         }
      }
   }

   # try looking on cinemablend
   if ($uri eq "" && $response =~ m/<a href="([^"]*)">([^"]*?)cinemablend/i) {
      my $page_url = $1; # copy before further matches overwrite the capture
      if ($page_url ne "") {
         if (defined $opt_d) { print "# found cinemablend poster page: $page_url \n"; }
         my $cinres = get($page_url);
         if (!defined $cinres) { $cinres = ""; }
         if (defined $opt_d) { printf("# got %i bytes\n", length($cinres)); }
         if (defined $opt_r) { printf("%s", $cinres); }

         if ($cinres =~ m/<td align=center><img src="([^"]*?)" border=1><\/td>/i) {
            if (defined $opt_d) { print "# cinemablend url retreived\n"; }
            # the image path on that page is site-relative
            $uri = "http://www.cinemablend.com/".$1;
         }
      }
   }

   # if none of the linked sites gave a filename grab it from imdb
   if ($uri eq "") {
      if (defined $opt_d) { print "# looking for imdb posters\n"; }
      my $host = "http://posters.imdb.com/posters/";

      $uri = parseBetween($response, $host, "\"><td><td><a href=\"");
      if ($uri ne "") {
         $uri = $host.$uri;
      } else {
         if (defined $opt_d) { print "# no poster found\n"; }
      }
   }

   # no poster found, take lowres image from imdb
   if ($uri eq "") {
      if (defined $opt_d) { print "# looking for lowres imdb posters\n"; }
      my $host = "http://www.imdb.com/title/tt" . $movieid . "/";
      my $page = get($host);
      if (!defined $page) { $page = ""; }

      # Better handling for low resolution posters
      #
      # (no /g here: a /g match in scalar context would leave pos() set on
      # the page and make the title scan below start after the poster)
      if ($page =~ m/<a name="poster".*<img.*src="([^"]*).*<\/a>/i) {
         if (defined $opt_d) { print "# found low res poster at: $1\n"; }
         $uri = $1;
      } else {
         if (defined $opt_d) { print "# no low res poster found\n"; }
         $uri = "";
      }

      # The alternative titles are not used for the lookup any more; they
      # are only listed in the debug trace.
      if (defined $opt_d) {
         print "# starting to look for movie title\n";

         # main title
         print "# Getting possible movie titles:\n";
         print "# Title: ".parseBetween($page, "<title>", "</title>")."\n";

         # all other titles the page marks as "(informal title)"
         while($page =~ m/>([^>^\(]*)([ ]{0,1}\([^\)]*\)[^\(^\)]*[ ]{0,1}){0,1}\(informal title\)/g) {
            print "# Title: ".trim($1)."\n";
         }
      }
   }

   print "$uri\n";
}
364
# dump Movie list: 1 entry per line, each line as 'movieid:Movie Title'
#
# 'filename' is turned into a search query (extension, "(...)"/"[...]"
# comments and a trailing ", The" are dropped) and sent to the IMDB search
# page together with 'options' (a ';' separated option string, may be empty
# or undefined).  Matches are printed to STDOUT as "movieid:Title (year)".
#
# - With -r the raw search response is printed and the program exits.
# - When IMDB answers with a movie page instead of a result list, that single
#   movie is printed and the program exits.
# - Returns without printing anything when there are no matches.
# - tv=, from_year=, to_year= and video=no are (re)applied locally, and
#   video games are always dropped.
sub getMovieList {
   my ($filename, $options)=@_; # grab parameters

   # If we wanted to inspect the file for any reason we can do that now

   #
   # Convert filename into a query string
   # (use same rules that Metadata::guesTitle does)
   my $query = $filename;
   $query = uri_unescape($query); # in case it was escaped
   # Strip off the file extension
   if (rindex($query, '.') != -1) {
      $query = substr($query, 0, rindex($query, '.'));
   }
   # Strip off anything following '(' - people use this for general comments
   if (rindex($query, '(') != -1) {
      $query = substr($query, 0, rindex($query, '('));
   }
   # Strip off anything following '[' - people use this for general comments
   if (rindex($query, '[') != -1) {
      $query = substr($query, 0, rindex($query, '['));
   }

   # IMDB searches do better if any trailing ,The is left off
   # (test the match itself: after a failed match $1 still holds whatever an
   # earlier successful match captured)
   if ($query =~ /(.+), The$/i) { $query = $1; }

   # prepare the url
   $query = uri_escape($query);
   if (!$options) { $options = "" ;}
   if (defined $opt_d) {
      printf("# query: '%s', options: '%s'\n", $query, $options);
   }

   # get the search results page
   # some known IMDB options are:
   # type=[fuzy] looser search
   # from_year=[int] limit matches to year (broken at imdb)
   # to_year=[int] limit matches to year (broken at imdb)
   # sort=[smart] ??
   # tv=[no|both|only] limits between tv and movies (broken at imdb)
   #$options = "tt=on;nm=on;mx=20"; # not exactly clear what these options do
   my $request = "http://www.imdb.com/find?q=$query;$options";
   if (defined $opt_d) { printf("# request: '%s'\n", $request); }
   my $response = get($request);
   # a failed download is handled like an empty page (ends in "no results")
   if (!defined $response) { $response = ""; }
   if (defined $opt_r) {
      print $response;
      exit(0);
   }

   # check to see if we got a results page or a movie page
   # looking for 'add=<movieid>" target=' which only exists
   # in a movie description page
   my $movienum = parseBetween($response, "add=", "\">");
   if ($movienum) {
      if (defined $opt_d) { printf("# redirected to movie page\n"); }
      my $movietitle = parseBetween($response, "<title>", "</title>");
      # drop the trailing " (year)"; a title without one is kept whole
      # instead of being replaced by a stale or undefined capture
      if ($movietitle =~ m#(.+) \((\d+)\)#) { $movietitle = $1; }
      print "$movienum:$movietitle\n";
      exit(0);
   }

   # extract possible matches
   # possible matches are grouped in several categories:
   # exact, partial, and approximate (approximate matches are not used)
   my $popular_results = parseBetween($response, "<b>Popular Titles</b>",
                                      "</p>");
   my $exact_matches = parseBetween($response, "<b>Titles (Exact Matches)</b>",
                                    "</p>");
   my $partial_matches = parseBetween($response, "<b>Titles (Partial Matches)</b>",
                                      "</p>");

   my $data = $popular_results.$exact_matches;
   # resort to partial matches if no exact
   if ($data eq "") { $data = $partial_matches; }
   if ($data eq "") {
      if (defined $opt_d) { printf("# no results\n"); }
      return;
   }

   # parse movie list from matches: every "<td ... </td" span is one entry.
   # An opening "<td" without a closing "</td" simply ends the scan.
   my @movies;
   while ($data =~ m{<td(.*?)</td}gs) {
      my $entry = $1;

      my $title = "";
      my $year = "";
      my $type = "";
      my $movienum = "";

      # split the entry after its first link: the link carries id and title,
      # the remainder carries "(year)" and an optional "(type)"
      my $link_end = "</a>";
      my $link_at = index($entry, $link_end);
      my $lhs = "";
      my $rhs = "";
      if ($link_at != -1) {
         my $fl_end = $link_at + length($link_end);
         $lhs = substr($entry, 0, $fl_end);
         $rhs = substr($entry, $fl_end);
      }

      if ($lhs =~ m/<a href="\/title\/tt(\d+)\/.*\">(.+)<\/a>/i) {
         $movienum = $1;
         $title = $2;
      } else {
         if (defined $opt_d) {
            print("Unrecognized entry format\n");
         }
         next;
      }

      if ($rhs =~ m/\((\d+)\) \((.+)\)/) {
         $year = $1;
         $type = $2;
      } elsif ($rhs =~ m/\((\d+)\)/) {
         $year = $1;
      }

      my $skip = 0;

      # fix broken 'tv=no' option
      if ($options =~ /tv=no/) {
         if ($type eq "TV") {
            if (defined $opt_d) {printf("# skipping TV program: %s\n", $title);}
            $skip = 1;
         }
      }
      if ($options =~ /tv=only/) {
         if ($type eq "") {
            if (defined $opt_d) {printf("# skipping Movie: %s\n", $title);}
            $skip = 1;
         }
      }
      # fix broken 'from_year=' option
      # (an unknown year compares as 0, exactly as the empty string would)
      if ($options =~ /from_year=(\d+)/) {
         if (($year || 0) < $1) {
            if (defined $opt_d) {printf("# skipping b/c of yr: %s\n", $title);}
            $skip = 1;
         }
      }
      # fix broken 'to_year=' option
      if ($options =~ /to_year=(\d+)/) {
         if (($year || 0) > $1) {
            if (defined $opt_d) {printf("# skipping b/c of yr: %s\n", $title);}
            $skip = 1;
         }
      }

      # option to strip out videos (I think that's what '(V)' means anyway?)
      if ($options =~ /video=no/) {
         if ($type eq "V") {
            if (defined $opt_d) {
               printf("# skipping Video program: %s\n", $title);
            }
            $skip = 1;
         }
      }

      # (always) strip out video game's (why does IMDB give these anyway?)
      if ($type eq "VG") {
         if (defined $opt_d) {printf("# skipping videogame: %s\n", $title);}
         $skip = 1;
      }

      # add to array, with the year appended when it is known
      if (!$skip) {
         my $moviename = $title;
         if ($year ne "") {
            $moviename .= " ($year)";
         }

         push @movies, $movienum . ":" . $moviename;
      }
   }

   # display array of values
   for my $movie (@movies) { print "$movie\n"; }
}
556
#
# Main Program
#

# parse command line arguments (sets the $opt_* flags, leaves the
# remaining parameters in @ARGV)
getopts('ohrdivDMP');

# print out info
if (defined $opt_v) {
   version();
   exit 1;
}
if (defined $opt_i) {
   info();
   exit 1;
}

# print out usage if asked for, or when no parameter is left (help() exits)
help() if (defined $opt_h || @ARGV == 0);

if (defined $opt_D) {
   # take movieid from cmdline arg
   my $movieid = shift @ARGV;
   die "Usage : $0 -D <movieid>\n" unless $movieid;
   getMovieData($movieid);
}
elsif (defined $opt_P) {
   # take movieid from cmdline arg
   my $movieid = shift @ARGV;
   die "Usage : $0 -P <movieid>\n" unless $movieid;
   getMoviePoster($movieid);
}
elsif (defined $opt_M) {
   # take query from cmdline arg; the options parameter in front of it
   # is optional
   my $options = shift @ARGV;
   die "Usage : $0 -M [options] <query>\n" unless $options;
   my $query = shift @ARGV;
   unless ($query) {
      # only one parameter was given: it is the query
      ($query, $options) = ($options, "");
   }
   getMovieList($query, $options);
}
593# vim: set expandtab ts=3 sw=3 :